npm - @agentv/core - Versions diffs - 0.22.2 → 0.25.0 - Mend

@agentv/core 0.22.2 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-B2J23S7D.js → chunk-OYTL3LNN.js} +24 -16
package/dist/chunk-OYTL3LNN.js.map +1 -0
package/dist/evaluation/validation/index.cjs +64 -17
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +48 -2
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +994 -50
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +205 -4
package/dist/index.d.ts +205 -4
package/dist/index.js +953 -23
package/dist/index.js.map +1 -1
package/package.json +3 -4
package/dist/chunk-B2J23S7D.js.map +0 -1

package/dist/index.cjs (changed)

@@ -31,11 +31,15 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
+  CompositeEvaluator: () => CompositeEvaluator,
+  ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
+  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
   buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
   buildSearchRoots: () => buildSearchRoots2,
+  computeTraceSummary: () => computeTraceSummary,
   consumeCodexLogEntries: () => consumeCodexLogEntries,
   createAgentKernel: () => createAgentKernel,
   createProvider: () => createProvider,
@@ -46,14 +50,18 @@ __export(index_exports, {
   generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
+  isExpectedToolCall: () => isExpectedToolCall,
   isGuidelineFile: () => isGuidelineFile,
   isJsonObject: () => isJsonObject,
   isJsonValue: () => isJsonValue,
   isTestMessage: () => isTestMessage,
   isTestMessageRole: () => isTestMessageRole,
+  isTraceEvent: () => isTraceEvent,
+  isTraceEventType: () => isTraceEventType,
   listTargetNames: () => listTargetNames,
   loadEvalCases: () => loadEvalCases,
   normalizeLineEndings: () => normalizeLineEndings,
+  readJsonFile: () => readJsonFile,
   readTargetDefinitions: () => readTargetDefinitions,
   readTestSuiteMetadata: () => readTestSuiteMetadata,
   readTextFile: () => readTextFile,
@@ -107,7 +115,14 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
+var EVALUATOR_KIND_VALUES = [
+  "code_judge",
+  "llm_judge",
+  "rubric",
+  "composite",
+  "tool_trajectory",
+  "expected_messages"
+];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -116,6 +131,44 @@ function getHitCount(result) {
   return result.hits.length;
 }
+// src/evaluation/trace.ts
+function isTraceEventType(value) {
+  return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
+}
+function isTraceEvent(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const candidate = value;
+  return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
+}
+function isExpectedToolCall(value) {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const candidate = value;
+  return typeof candidate.tool === "string";
+}
+function computeTraceSummary(trace) {
+  const toolCallCounts = {};
+  let errorCount = 0;
+  for (const event of trace) {
+    if (event.type === "tool_call" && event.name) {
+      toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
+    }
+    if (event.type === "error") {
+      errorCount++;
+    }
+  }
+  const toolNames = Object.keys(toolCallCounts).sort();
+  return {
+    eventCount: trace.length,
+    toolNames,
+    toolCallsByName: toolCallCounts,
+    errorCount
+  };
+}
 // src/evaluation/yaml-parser.ts
 var import_promises6 = require("fs/promises");
 var import_node_path6 = __toESM(require("path"), 1);
@@ -459,10 +512,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
       continue;
     }
-    if (typeValue === "code") {
+    if (typeValue === "code_judge") {
       const script = asString2(rawEvaluator.script);
       if (!script) {
-        logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
+        logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
       }
       const cwd = asString2(rawEvaluator.cwd);
@@ -473,7 +526,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
         } else {
           logWarning2(
-            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
+            `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
             resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
           );
         }
@@ -489,6 +542,174 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
+    if (typeValue === "composite") {
+      const rawMembers = rawEvaluator.evaluators;
+      if (!Array.isArray(rawMembers)) {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
+        );
+        continue;
+      }
+      const rawAggregator = rawEvaluator.aggregator;
+      if (!isJsonObject2(rawAggregator)) {
+        logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
+        continue;
+      }
+      const aggregatorType = asString2(rawAggregator.type);
+      if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
+        );
+        continue;
+      }
+      const memberEvaluators = [];
+      for (const rawMember of rawMembers) {
+        if (!isJsonObject2(rawMember)) {
+          logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
+          continue;
+        }
+        const memberName = asString2(rawMember.name);
+        const memberType = rawMember.type;
+        if (!memberName || !isEvaluatorKind(memberType)) {
+          logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
+          continue;
+        }
+        const memberConfigs = await parseEvaluators(
+          { evaluators: [rawMember] },
+          void 0,
+          searchRoots,
+          `${evalId}:${name}:${memberName}`
+        );
+        if (memberConfigs && memberConfigs.length > 0) {
+          memberEvaluators.push(memberConfigs[0]);
+        }
+      }
+      if (memberEvaluators.length === 0) {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
+        );
+        continue;
+      }
+      let aggregator;
+      if (aggregatorType === "weighted_average") {
+        const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
+        const parsedWeights = {};
+        if (weights) {
+          for (const [key, value] of Object.entries(weights)) {
+            if (typeof value === "number") {
+              parsedWeights[key] = value;
+            }
+          }
+        }
+        aggregator = {
+          type: "weighted_average",
+          ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
+        };
+      } else if (aggregatorType === "code_judge") {
+        const aggregatorPath = asString2(rawAggregator.path);
+        if (!aggregatorPath) {
+          logWarning2(
+            `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
+          );
+          continue;
+        }
+        aggregator = {
+          type: "code_judge",
+          path: aggregatorPath,
+          cwd: searchRoots[0]
+        };
+      } else {
+        const aggregatorPrompt = asString2(rawAggregator.prompt);
+        let promptPath2;
+        if (aggregatorPrompt) {
+          const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
+          if (resolved.resolvedPath) {
+            promptPath2 = import_node_path3.default.resolve(resolved.resolvedPath);
+          }
+        }
+        aggregator = {
+          type: "llm_judge",
+          ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
+          ...promptPath2 ? { promptPath: promptPath2 } : {}
+        };
+      }
+      evaluators.push({
+        name,
+        type: "composite",
+        evaluators: memberEvaluators,
+        aggregator
+      });
+      continue;
+    }
+    if (typeValue === "expected_messages") {
+      evaluators.push({
+        name,
+        type: "expected_messages"
+      });
+      continue;
+    }
+    if (typeValue === "tool_trajectory") {
+      const mode = asString2(rawEvaluator.mode);
+      if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
+        logWarning2(
+          `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
+        );
+        continue;
+      }
+      const rawMinimums = rawEvaluator.minimums;
+      let minimums;
+      if (rawMinimums !== void 0) {
+        if (!isJsonObject2(rawMinimums)) {
+          logWarning2(
+            `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
+          );
+          continue;
+        }
+        minimums = {};
+        for (const [toolName, count] of Object.entries(rawMinimums)) {
+          if (typeof count === "number" && count >= 0) {
+            minimums[toolName] = count;
+          }
+        }
+      }
+      const rawExpected = rawEvaluator.expected;
+      let expected;
+      if (rawExpected !== void 0) {
+        if (!Array.isArray(rawExpected)) {
+          logWarning2(
+            `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
+          );
+          continue;
+        }
+        expected = [];
+        for (const item of rawExpected) {
+          if (isJsonObject2(item) && typeof item.tool === "string") {
+            expected.push({ tool: item.tool });
+          }
+        }
+      }
+      if (mode === "any_order" && !minimums) {
+        logWarning2(
+          `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
+        );
+        continue;
+      }
+      if ((mode === "in_order" || mode === "exact") && !expected) {
+        logWarning2(
+          `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
+        );
+        continue;
+      }
+      const config = {
+        name,
+        type: "tool_trajectory",
+        mode,
+        ...minimums ? { minimums } : {},
+        ...expected ? { expected } : {}
+      };
+      evaluators.push(config);
+      continue;
+    }
     const prompt = asString2(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
@@ -742,6 +963,67 @@ ${detailBlock}${ANSI_RESET4}`);
     console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
   }
 }
+async function processExpectedMessages(options) {
+  const { messages, searchRoots, repoRootPath, verbose } = options;
+  const segments = [];
+  for (const message of messages) {
+    const segment = {
+      role: message.role
+    };
+    if (message.role === "assistant" && message.tool_calls !== void 0) {
+      segment.tool_calls = message.tool_calls;
+    }
+    const content = message.content;
+    if (typeof content === "string") {
+      segment.content = content;
+    } else if (Array.isArray(content)) {
+      const processedContent = [];
+      for (const rawSegment of content) {
+        if (!isJsonObject(rawSegment)) {
+          continue;
+        }
+        const segmentType = asString3(rawSegment.type);
+        if (segmentType === "file") {
+          const rawValue = asString3(rawSegment.value);
+          if (!rawValue) {
+            continue;
+          }
+          const { displayPath, resolvedPath, attempted } = await resolveFileReference(
+            rawValue,
+            searchRoots
+          );
+          if (!resolvedPath) {
+            const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
+            logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
+            continue;
+          }
+          try {
+            const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
+            processedContent.push({
+              type: "file",
+              path: displayPath,
+              text: fileContent,
+              resolvedPath: import_node_path4.default.resolve(resolvedPath)
+            });
+            if (verbose) {
+              console.log(`  [Expected Output File] Found: ${displayPath}`);
+              console.log(`    Resolved to: ${resolvedPath}`);
+            }
+          } catch (error) {
+            logWarning3(
+              `Could not read expected output file ${resolvedPath}: ${error.message}`
+            );
+          }
+          continue;
+        }
+        processedContent.push(cloneJsonObject(rawSegment));
+      }
+      segment.content = processedContent;
+    }
+    segments.push(segment);
+  }
+  return segments;
+}
 // src/evaluation/formatting/prompt-builder.ts
 var import_promises5 = require("fs/promises");
@@ -1046,12 +1328,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       messageType: "input",
       verbose
     });
-    const outputSegments = hasExpectedMessages ? await processMessages({
+    const outputSegments = hasExpectedMessages ? await processExpectedMessages({
       messages: expectedMessages,
       searchRoots,
       repoRootPath,
-      guidelinePatterns,
-      messageType: "output",
       verbose
     }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1178,6 +1458,10 @@ async function readTextFile(filePath) {
   const content = await (0, import_promises7.readFile)(filePath, "utf8");
   return normalizeLineEndings(content);
 }
+async function readJsonFile(filePath) {
+  const content = await (0, import_promises7.readFile)(filePath, "utf8");
+  return JSON.parse(content);
+}
 async function findGitRoot(startPath) {
   let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
   const root = import_node_path7.default.parse(currentDir).root;
@@ -1686,9 +1970,11 @@ var CliProvider = class {
       const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
       throw new Error(message);
     }
-    const responseText = await this.readAndCleanupOutputFile(outputFilePath);
+    const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
+    const parsed = this.parseOutputContent(responseContent);
     return {
-      text: responseText,
+      text: parsed.text,
+      trace: parsed.trace,
       raw: {
         command: renderedCommand,
         stderr: result.stderr,
@@ -1698,6 +1984,31 @@ var CliProvider = class {
       }
     };
   }
+  /**
+   * Parse output content from CLI.
+   * If the content is valid JSON with a 'text' field, extract text and optional trace.
+   * Otherwise, treat the entire content as plain text.
+   */
+  parseOutputContent(content) {
+    try {
+      const parsed = JSON.parse(content);
+      if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
+        const obj = parsed;
+        const text = typeof obj.text === "string" ? obj.text : String(obj.text);
+        const trace = this.parseTrace(obj.trace);
+        return { text, trace };
+      }
+    } catch {
+    }
+    return { text: content };
+  }
+  parseTrace(trace) {
+    if (!Array.isArray(trace)) {
+      return void 0;
+    }
+    const validEvents = trace.filter(isTraceEvent);
+    return validEvents.length > 0 ? validEvents : void 0;
+  }
   async readAndCleanupOutputFile(filePath) {
     try {
       const content = await readTextFile(filePath);
@@ -2684,6 +2995,7 @@ var MockProvider = class {
   delayMs;
   delayMinMs;
   delayMaxMs;
+  trace;
   constructor(targetName, config) {
     this.id = `mock:${targetName}`;
     this.targetName = targetName;
@@ -2691,6 +3003,7 @@ var MockProvider = class {
     this.delayMs = config.delayMs ?? 0;
     this.delayMinMs = config.delayMinMs ?? 0;
     this.delayMaxMs = config.delayMaxMs ?? 0;
+    this.trace = config.trace;
   }
   async invoke(request) {
     const delay = this.calculateDelay();
@@ -2702,7 +3015,8 @@ var MockProvider = class {
       raw: {
         question: request.question,
         guidelines: request.guidelines
-      }
+      },
+      trace: this.trace
     };
   }
   calculateDelay() {
@@ -2716,6 +3030,7 @@ var MockProvider = class {
 };
 // src/evaluation/providers/targets.ts
+var import_node_path11 = __toESM(require("path"), 1);
 var import_zod = require("zod");
 var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
   "PROMPT",
@@ -2731,7 +3046,7 @@ var BASE_TARGET_SCHEMA = import_zod.z.object({
   judge_target: import_zod.z.string().optional(),
   workers: import_zod.z.number().int().min(1).optional()
 }).passthrough();
-var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
+var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
 function normalizeAzureApiVersion(value) {
   if (!value) {
     return DEFAULT_AZURE_API_VERSION;
@@ -2775,7 +3090,7 @@ function resolveRetryConfig(target) {
     retryableStatusCodes
   };
 }
-function resolveTargetDefinition(definition, env = process.env) {
+function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
   const parsed = BASE_TARGET_SCHEMA.parse(definition);
   const provider = parsed.provider.toLowerCase();
   const providerBatching = resolveOptionalBoolean(
@@ -2848,7 +3163,7 @@ function resolveTargetDefinition(definition, env = process.env) {
         judgeTarget: parsed.judge_target,
         workers: parsed.workers,
         providerBatching,
-        config: resolveCliConfig(parsed, env)
+        config: resolveCliConfig(parsed, env, evalFilePath)
       };
     default:
       throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
@@ -2966,7 +3281,8 @@ function normalizeCodexLogFormat(value) {
 }
 function resolveMockConfig(target) {
   const response = typeof target.response === "string" ? target.response : void 0;
-  return { response };
+  const trace = Array.isArray(target.trace) ? target.trace : void 0;
+  return { response, trace };
 }
 function resolveVSCodeConfig(target, env, insiders) {
   const workspaceTemplateEnvVar = resolveOptionalLiteralString(
@@ -2998,15 +3314,18 @@ function resolveVSCodeConfig(target, env, insiders) {
     workspaceTemplate
   };
 }
-function resolveCliConfig(target, env) {
+function resolveCliConfig(target, env, evalFilePath) {
   const commandTemplateSource = target.command_template ?? target.commandTemplate;
   const filesFormat = resolveOptionalLiteralString(
     target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
   );
-  const cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
+  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
     allowLiteral: true,
     optionalEnv: true
   });
+  if (!cwd && evalFilePath) {
+    cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
+  }
   const timeoutMs = resolveTimeoutMs(
     target.timeout_seconds ?? target.timeoutSeconds,
     `${target.name} timeout`
@@ -3124,17 +3443,15 @@ function resolveOptionalString(source, env, description, options) {
   if (envVarMatch) {
     const varName = envVarMatch[1];
     const envValue = env[varName];
-    if (envValue !== void 0) {
-      if (envValue.trim().length === 0) {
-        throw new Error(`Environment variable '${varName}' for ${description} is empty`);
-      }
-      return envValue;
-    }
     const optionalEnv = options?.optionalEnv ?? false;
-    if (optionalEnv) {
-      return void 0;
+    if (envValue === void 0 || envValue.trim().length === 0) {
+      if (optionalEnv) {
+        return void 0;
+      }
+      const status = envValue === void 0 ? "is not set" : "is empty";
+      throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
     }
-    throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
+    return envValue;
   }
   const allowLiteral = options?.allowLiteral ?? false;
   if (!allowLiteral) {
@@ -3246,7 +3563,7 @@ function resolveOptionalNumberArray(source, description) {
 }
 // src/evaluation/providers/vscode.ts
-var import_node_path11 = __toESM(require("path"), 1);
+var import_node_path12 = __toESM(require("path"), 1);
 var import_subagent = require("subagent");
 // src/evaluation/providers/vscode-templates.ts
@@ -3416,7 +3733,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path11.default.basename(absolutePath);
+    const fileName = import_node_path12.default.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -3441,8 +3758,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path11.default.resolve(attachment);
-    const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
+    const absolutePath = import_node_path12.default.resolve(attachment);
+    const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -3457,7 +3774,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path11.default.resolve(attachment);
+    const absolutePath = import_node_path12.default.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -3465,7 +3782,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
+  const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -3478,7 +3795,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(import_node_path11.default.resolve(attachment));
+    deduped.add(import_node_path12.default.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -3487,7 +3804,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(import_node_path11.default.resolve(inputFile));
+      deduped.add(import_node_path12.default.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3536,7 +3853,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 var import_node_fs4 = require("fs");
 var import_promises10 = require("fs/promises");
-var import_node_path12 = __toESM(require("path"), 1);
+var import_node_path13 = __toESM(require("path"), 1);
 var import_yaml3 = require("yaml");
 function isRecord(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3573,7 +3890,7 @@ async function fileExists3(filePath) {
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = import_node_path12.default.resolve(filePath);
+  const absolutePath = import_node_path13.default.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
@@ -4021,11 +4338,478 @@ function substituteVariables(template, variables) {
     return variables[varName] ?? match;
   });
 }
+var ToolTrajectoryEvaluator = class {
+  kind = "tool_trajectory";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  evaluate(context) {
+    const { candidateTrace, candidateTraceSummary } = context;
+    if (!candidateTrace || !candidateTraceSummary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    switch (this.config.mode) {
+      case "any_order":
+        return this.evaluateAnyOrder(candidateTraceSummary);
+      case "in_order":
+        return this.evaluateInOrder(candidateTrace);
+      case "exact":
+        return this.evaluateExact(candidateTrace);
+      default:
+        return {
+          score: 0,
+          verdict: "fail",
+          hits: [],
+          misses: [`Unknown mode: ${this.config.mode}`],
+          expectedAspectCount: 1
+        };
+    }
+  }
+  evaluateAnyOrder(summary) {
+    const minimums = this.config.minimums ?? {};
+    const toolNames = Object.keys(minimums);
+    if (toolNames.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool requirements specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    for (const toolName of toolNames) {
+      const required = minimums[toolName];
+      const actual = summary.toolCallsByName[toolName] ?? 0;
+      if (actual >= required) {
+        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      } else {
+        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      }
+    }
+    const score = hits.length / toolNames.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: toolNames.length
+    };
+  }
+  evaluateInOrder(trace) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
+    const hits = [];
+    const misses = [];
+    let actualIndex = 0;
+    for (let i = 0; i < expected.length; i++) {
+      const expectedTool = expected[i].tool;
+      let found = false;
+      while (actualIndex < actualToolCalls.length) {
+        if (actualToolCalls[actualIndex].name === expectedTool) {
+          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+          actualIndex++;
+          found = true;
+          break;
+        }
+        actualIndex++;
+      }
+      if (!found) {
+        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
+      }
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+  evaluateExact(trace) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
+    const hits = [];
+    const misses = [];
+    if (actualToolCalls.length !== expected.length) {
+      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
+    }
+    const checkLength = Math.min(expected.length, actualToolCalls.length);
+    for (let i = 0; i < checkLength; i++) {
+      const expectedTool = expected[i].tool;
+      const actualTool = actualToolCalls[i].name;
+      if (actualTool === expectedTool) {
+        hits.push(`Position ${i}: ${expectedTool} \u2713`);
+      } else {
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
+var ExpectedMessagesEvaluator = class { // Evaluator that checks the candidate trace's tool calls against the tool_calls listed in evalCase.expected_segments.
+  kind = "expected_messages";
+  evaluate(context) { // Synchronous; reads context.candidateTrace and context.evalCase and returns { score, verdict, hits, misses, expectedAspectCount }.
+    const { candidateTrace, evalCase } = context;
+    const expectedSegments = evalCase.expected_segments;
+    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
+    if (expectedToolCalls.length === 0) { // Nothing to verify: reported as a pass with score 1.
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool_calls specified in expected_messages"],
+        misses: [],
+        expectedAspectCount: 1
+      };
+    }
+    if (!candidateTrace || candidateTrace.length === 0) { // Expectations exist but the trace is missing or empty: score 0, "fail".
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available to validate tool_calls"],
+        expectedAspectCount: expectedToolCalls.length
+      };
+    }
+    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call"); // Only "tool_call" events take part in the comparison.
+    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
+  }
+  extractExpectedToolCalls(segments) { // Flattens tool_calls of assistant segments into [{ tool, input }]; returns [] when segments is falsy.
+    if (!segments) {
+      return [];
+    }
+    const toolCalls = [];
+    for (const segment of segments) {
+      const role = segment.role;
+      const segmentToolCalls = segment.tool_calls;
+      if (role === "assistant" && Array.isArray(segmentToolCalls)) { // Segments with another role, or without an array of tool_calls, are skipped.
+        for (const tc of segmentToolCalls) {
+          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") { // Entries lacking a string "tool" are dropped silently rather than reported.
+            const toolCall = tc;
+            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
+          }
+        }
+      }
+    }
+    return toolCalls;
+  }
+  validateToolCalls(expected, actual) { // Compares expected[i] with actual[i] by index; actual calls beyond expected.length are never examined.
+    const hits = [];
+    const misses = [];
+    for (let i = 0; i < expected.length; i++) {
+      const expectedCall = expected[i];
+      const actualCall = actual[i];
+      if (!actualCall) { // The trace has fewer tool calls than expected.
+        misses.push(
+          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
+        );
+        continue;
+      }
+      if (actualCall.name !== expectedCall.tool) { // The expected "tool" is matched against the trace event's "name" field.
+        misses.push(
+          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
+        );
+        continue;
+      }
+      if (expectedCall.input !== void 0) { // Input is only compared when the expectation specifies one.
+        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
+          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
+          continue;
+        }
+      }
+      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
+    }
+    const totalChecks = expected.length || 1; // Falls back to 1 so the division below cannot divide by zero.
+    const score = hits.length / totalChecks;
+    return {
+      score,
+      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail", // NOTE(review): thresholds are hard-coded here while other evaluators in this file call scoreToVerdict(score) — confirm they agree.
+      hits,
+      misses,
+      expectedAspectCount: totalChecks
+    };
+  }
+  deepEquals(a, b) { // Recursive structural equality over primitives, arrays and objects (keys taken from Object.keys).
+    if (a === b) return true;
+    if (typeof a !== typeof b) return false;
+    if (typeof a !== "object" || a === null || b === null) return false; // Values that failed === and are not both non-null objects are unequal.
+    if (Array.isArray(a) && Array.isArray(b)) {
+      if (a.length !== b.length) return false;
+      return a.every((val, i) => this.deepEquals(val, b[i]));
+    }
+    if (Array.isArray(a) || Array.isArray(b)) return false; // Exactly one side is an array.
+    const aObj = a;
+    const bObj = b;
+    const aKeys = Object.keys(aObj);
+    const bKeys = Object.keys(bObj);
+    if (aKeys.length !== bKeys.length) return false;
+    return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key])); // NOTE(review): only a's keys are looked up, so with equal key counts a key holding undefined in a matches a different, absent key in b.
+  }
+};
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
+{{EVALUATOR_RESULTS_JSON}}
+Decide the final score and verdict based on all evaluator results.
+Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`; // Default user-prompt template for the composite LLM aggregator; used when the aggregator config has no prompt, with {{EVALUATOR_RESULTS_JSON}} replaced by the member results JSON.
+var CompositeEvaluator = class { // Runs every member evaluator from config.evaluators and merges their results with the configured aggregator.
+  kind = "composite";
+  config;
+  evaluatorFactory;
+  cwd;
+  constructor(options) { // options: { config, evaluatorFactory, cwd }; each is stored unchanged on the instance.
+    this.config = options.config;
+    this.evaluatorFactory = options.evaluatorFactory;
+    this.cwd = options.cwd;
+  }
+  async evaluate(context) { // Resolves to the aggregated result; a rejection from any member evaluator rejects the whole call (Promise.all).
+    const memberResults = await Promise.all(
+      this.config.evaluators.map(async (memberConfig) => {
+        const evaluator = this.evaluatorFactory.create(memberConfig, context); // Every member is built by the factory and receives the same context object.
+        return {
+          id: memberConfig.name,
+          type: memberConfig.type,
+          result: await evaluator.evaluate(context)
+        };
+      })
+    );
+    return this.aggregate(memberResults, context);
+  }
+  async aggregate(results, context) { // Dispatches on config.aggregator.type; any type other than "code_judge"/"llm_judge" falls through to the weighted average.
+    const aggregator = this.config.aggregator; // NOTE(review): .type is read unguarded below, so this assumes config.aggregator is always set — confirm upstream.
+    switch (aggregator.type) {
+      case "code_judge":
+        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd); // NOTE(review): no weights argument is passed, so runCodeAggregator reports weight 1 for every member.
+      case "llm_judge":
+        return this.runLlmAggregator(results, context, aggregator);
+      default:
+        return this.runWeightedAverage(results, aggregator.weights);
+    }
+  }
+  runWeightedAverage(results, weights) { // weights: optional map of member id -> weight; members without an entry weigh 1.
+    let totalWeight = 0;
+    let weightedSum = 0;
+    const allHits = [];
+    const allMisses = [];
+    const reasoningParts = [];
+    const evaluatorResults = [];
+    for (const member of results) {
+      const weight = weights?.[member.id] ?? 1;
+      totalWeight += weight;
+      weightedSum += member.result.score * weight;
+      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`)); // Hits and misses are prefixed with the member id so the merged lists stay attributable.
+      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
+      if (member.result.reasoning) {
+        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
+      }
+      evaluatorResults.push({
+        name: member.id,
+        type: member.type,
+        score: member.result.score,
+        weight,
+        verdict: member.result.verdict,
+        hits: [...member.result.hits],
+        misses: [...member.result.misses],
+        reasoning: member.result.reasoning,
+        evaluatorRawRequest: member.result.evaluatorRawRequest,
+        evaluatorResults: member.result.evaluatorResults
+      });
+    }
+    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0; // Guards the division when there are no members or the weights do not sum to a positive number.
+    return {
+      score: clampScore(finalScore),
+      verdict: scoreToVerdict(finalScore), // NOTE(review): the verdict is derived from the unclamped value, unlike score above.
+      hits: allHits,
+      misses: allMisses,
+      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
+      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      evaluatorRawRequest: {
+        aggregator: "weighted_average",
+        ...weights ? { weights } : {}
+      },
+      evaluatorResults
+    };
+  }
+  async runCodeAggregator(results, scriptPath, cwd, weights) { // Hands the member results as a JSON payload to executeScript and interprets its output as JSON; failures inside the try become a failing result.
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weights?.[member.id] ?? 1,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults
+    }));
+    try {
+      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0); // A missing or non-numeric score counts as 0.
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score); // The script's verdict is honoured only if it is one of the three known strings; otherwise it is derived from the score.
+      return {
+        score,
+        verdict,
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1, // Parsed as (hits + misses) || 1, i.e. at least 1.
+        reasoning,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath
+        },
+        evaluatorResults
+      };
+    } catch (error) { // Anything thrown inside the try is reported as a failing result that carries the error message.
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath,
+          error: message
+        },
+        evaluatorResults
+      };
+    }
+  }
+  async runLlmAggregator(results, context, config) { // Asks context.judgeProvider for the final score given the member results; config.prompt overrides the default template.
+    const judgeProvider = context.judgeProvider;
+    if (!judgeProvider) { // Checked before the try below, so this error propagates to the caller.
+      throw new Error("No judge provider available for LLM aggregation");
+    }
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const resultsJson = JSON.stringify(resultsObject, null, 2);
+    const evaluatorResults = results.map((member) => ({ // Unlike the other two aggregators, no weight field is recorded per member here.
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults
+    }));
+    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
+    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson); // NOTE(review): resultsJson is used as a replacement string, so sequences such as "$&" or "$$" inside it are interpreted as replacement patterns by String.prototype.replace.
+    const systemPrompt = buildOutputSchema();
+    const evaluatorRawRequest = {
+      aggregator: "llm_judge",
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    try {
+      const model = judgeProvider.asLanguageModel?.(); // Optional hook: when it yields a model, generateText is used instead of judgeProvider.invoke.
+      if (model) {
+        const { text } = await (0, import_ai2.generateText)({
+          model,
+          system: systemPrompt,
+          prompt: userPrompt
+        });
+        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
+        const score2 = clampScore(data2.score);
+        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : []; // At most four non-empty hits (and misses, below) are kept.
+        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
+        const reasoning2 = data2.reasoning;
+        return {
+          score: score2,
+          verdict: scoreToVerdict(score2),
+          hits: hits2,
+          misses: misses2,
+          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
+          reasoning: reasoning2,
+          evaluatorRawRequest,
+          evaluatorResults
+        };
+      }
+      const response = await judgeProvider.invoke({ // Reached only when no language model was returned above.
+        question: userPrompt,
+        systemPrompt,
+        evalCaseId: context.evalCase.id,
+        attempt: context.attempt
+      });
+      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning ?? response.reasoning; // Falls back to the provider-level reasoning when the parsed payload has none.
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: Math.max(hits.length + misses.length, 1),
+        reasoning,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    } catch { // NOTE(review): the error is discarded; the failing result below carries no misses or reasoning explaining what went wrong.
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [],
+        expectedAspectCount: 1,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    }
+  }
+};
 // src/evaluation/orchestrator.ts
 var import_node_crypto2 = require("crypto");
 var import_promises11 = require("fs/promises");
-var import_node_path13 = __toESM(require("path"), 1);
+var import_node_path14 = __toESM(require("path"), 1);
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -4232,7 +5016,7 @@ async function runEvaluation(options) {
     if (!definition) {
       return void 0;
     }
-    const resolved = resolveTargetDefinition(definition, envLookup);
+    const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
     resolvedTargetsByName.set(name, resolved);
     return resolved;
   };
@@ -4546,6 +5330,17 @@ async function runEvalCase(options) {
   if (cacheKey && cache && !cachedResponse) {
     await cache.set(cacheKey, providerResponse);
   }
+  let candidateTrace = providerResponse.trace;
+  if (!candidateTrace && providerResponse.traceRef) {
+    try {
+      const rawTrace = await readJsonFile(providerResponse.traceRef);
+      if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
+        candidateTrace = rawTrace;
+      }
+    } catch {
+    }
+  }
+  const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
   try {
     return await evaluateCandidate({
       evalCase,
@@ -4557,7 +5352,9 @@ async function runEvalCase(options) {
       nowFn,
       attempt,
       judgeProvider,
-      agentTimeoutMs
+      agentTimeoutMs,
+      candidateTrace,
+      candidateTraceSummary
     });
   } catch (error) {
     return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4574,7 +5371,9 @@ async function evaluateCandidate(options) {
     nowFn,
     attempt,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   } = options;
   const gradeTimestamp = nowFn();
   const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4587,7 +5386,9 @@ async function evaluateCandidate(options) {
     promptInputs,
     now: gradeTimestamp,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   });
   const completedAt = nowFn();
   let agentProviderRequest;
@@ -4626,7 +5427,8 @@ async function evaluateCandidate(options) {
     agent_provider_request: agentProviderRequest,
     lm_provider_request: lmProviderRequest,
     evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
-    evaluator_results: evaluatorResults
+    evaluator_results: evaluatorResults,
+    trace_summary: candidateTraceSummary
   };
 }
 async function runEvaluatorsForCase(options) {
@@ -4640,7 +5442,9 @@ async function runEvaluatorsForCase(options) {
     promptInputs,
     now,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
     return runEvaluatorList({
@@ -4654,7 +5458,9 @@ async function runEvaluatorsForCase(options) {
       promptInputs,
       now,
       judgeProvider,
-      agentTimeoutMs
+      agentTimeoutMs,
+      candidateTrace,
+      candidateTraceSummary
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4670,7 +5476,9 @@ async function runEvaluatorsForCase(options) {
     attempt,
     promptInputs,
     now,
-    judgeProvider
+    judgeProvider,
+    candidateTrace,
+    candidateTraceSummary
   });
   return { score };
 }
@@ -4686,7 +5494,9 @@ async function runEvaluatorList(options) {
     promptInputs,
     now,
     judgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    candidateTrace,
+    candidateTraceSummary
   } = options;
   const scored = [];
   const evaluatorResults = [];
@@ -4732,6 +5542,63 @@ async function runEvaluatorList(options) {
           promptInputs,
           now
         });
+        scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: "code_judge",
+          score: score2.score,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning,
+          evaluator_provider_request: score2.evaluatorRawRequest
+        });
+      }
+      if (evaluator.type === "composite") {
+        const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const createEvaluator = (memberConfig) => {
+          switch (memberConfig.type) {
+            case "llm_judge":
+              return evaluatorRegistry.llm_judge;
+            case "code":
+              return new CodeEvaluator({
+                script: memberConfig.script,
+                cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
+                agentTimeoutMs
+              });
+            case "composite":
+              return new CompositeEvaluator({
+                config: memberConfig,
+                cwd: evalFileDir,
+                evaluatorFactory: { create: createEvaluator }
+              });
+            case "tool_trajectory":
+              return new ToolTrajectoryEvaluator({
+                config: memberConfig
+              });
+            case "expected_messages":
+              return new ExpectedMessagesEvaluator();
+            default: {
+              const unknownConfig = memberConfig;
+              throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
+            }
+          }
+        };
+        const compositeEvaluator = new CompositeEvaluator({
+          config: evaluator,
+          cwd: evalFileDir,
+          evaluatorFactory: { create: createEvaluator }
+        });
+        const score2 = await compositeEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          judgeProvider
+        });
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
         evaluatorResults.push({
           name: evaluator.name,
@@ -4741,7 +5608,58 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest
+          evaluator_provider_request: score2.evaluatorRawRequest,
+          evaluator_results: mapChildResults(score2.evaluatorResults)
+        });
+      }
+      if (evaluator.type === "tool_trajectory") {
+        const trajectoryEvaluator = new ToolTrajectoryEvaluator({
+          config: evaluator
+        });
+        const score2 = trajectoryEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          candidateTrace,
+          candidateTraceSummary
+        });
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "expected_messages") {
+        const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
+        const score2 = expectedMessagesEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          candidateTrace,
+          candidateTraceSummary
+        });
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
         });
       }
     } catch (error) {
@@ -4754,14 +5672,15 @@ async function runEvaluatorList(options) {
         expectedAspectCount: 1,
         reasoning: message
       };
+      const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
       scored.push({
         score: fallbackScore,
         name: evaluator.name ?? "unknown",
-        type: evaluator.type ?? "unknown"
+        type: resultType ?? "llm_judge"
       });
       evaluatorResults.push({
         name: evaluator.name ?? "unknown",
-        type: evaluator.type ?? "unknown",
+        type: resultType ?? "llm_judge",
         score: 0,
         verdict: "fail",
         hits: [],
@@ -4865,8 +5784,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
 async function dumpPrompt(directory, evalCase, promptInputs) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
   const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = import_node_path13.default.resolve(directory, filename);
-  await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
+  const filePath = import_node_path14.default.resolve(directory, filename);
+  await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
   const payload = {
     eval_id: evalCase.id,
     question: promptInputs.question,
@@ -4979,6 +5898,23 @@ function isTimeoutLike(error) {
   const value = String(error).toLowerCase();
   return value.includes("timeout");
 }
+function mapChildResults(children) { // Recursively converts nested evaluator results to an output shape with snake_case request/result keys.
+  if (!children || children.length === 0) { // A missing or empty list yields undefined rather than [].
+    return void 0;
+  }
+  return children.map((child) => ({
+    name: child.name,
+    type: child.type,
+    score: child.score,
+    weight: child.weight,
+    verdict: child.verdict,
+    hits: child.hits,
+    misses: child.misses,
+    reasoning: child.reasoning,
+    evaluator_provider_request: child.evaluatorRawRequest, // camelCase evaluatorRawRequest / evaluatorResults are renamed to snake_case keys here.
+    evaluator_results: mapChildResults(child.evaluatorResults)
+  }));
+}
 // src/evaluation/generators/rubric-generator.ts
 var import_ai3 = require("ai");
@@ -5067,11 +6003,15 @@ function createAgentKernel() {
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   CodeEvaluator,
+  CompositeEvaluator,
+  ExpectedMessagesEvaluator,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
+  ToolTrajectoryEvaluator,
   buildDirectoryChain,
   buildPromptInputs,
   buildSearchRoots,
+  computeTraceSummary,
   consumeCodexLogEntries,
   createAgentKernel,
   createProvider,
@@ -5082,14 +6022,18 @@ function createAgentKernel() {
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
+  isExpectedToolCall,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,
   isTestMessage,
   isTestMessageRole,
+  isTraceEvent,
+  isTraceEventType,
   listTargetNames,
   loadEvalCases,
   normalizeLineEndings,
+  readJsonFile,
   readTargetDefinitions,
   readTestSuiteMetadata,
   readTextFile,