npm - @agentv/core - Versions diffs - 2.0.2 → 2.1.0 - Mend

@agentv/core 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -42,31 +42,39 @@ __export(index_exports, {
   ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
   avgToolDurationMs: () => avgToolDurationMs,
   buildDirectoryChain: () => buildDirectoryChain2,
+  buildOutputSchema: () => buildOutputSchema,
   buildPromptInputs: () => buildPromptInputs,
   buildSearchRoots: () => buildSearchRoots2,
+  clampScore: () => clampScore,
   computeTraceSummary: () => computeTraceSummary,
   consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
   consumeCodexLogEntries: () => consumeCodexLogEntries,
   consumePiLogEntries: () => consumePiLogEntries,
   createAgentKernel: () => createAgentKernel,
   createProvider: () => createProvider,
+  deepEqual: () => deepEqual,
   ensureVSCodeSubagents: () => ensureVSCodeSubagents,
+  executeScript: () => executeScript,
   explorationRatio: () => explorationRatio,
-  extractCodeBlocks: () => extractCodeBlocks,
+  extractJsonBlob: () => extractJsonBlob,
   fileExists: () => fileExists2,
   findGitRoot: () => findGitRoot,
+  freeformEvaluationSchema: () => freeformEvaluationSchema,
   generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
   isGuidelineFile: () => isGuidelineFile,
   isJsonObject: () => isJsonObject,
   isJsonValue: () => isJsonValue,
+  isNonEmptyString: () => isNonEmptyString,
   isTestMessage: () => isTestMessage,
   isTestMessageRole: () => isTestMessageRole,
   listTargetNames: () => listTargetNames,
   loadEvalCases: () => loadEvalCases,
   mergeExecutionMetrics: () => mergeExecutionMetrics,
   normalizeLineEndings: () => normalizeLineEndings,
+  parseJsonFromText: () => parseJsonFromText,
+  parseJsonSafe: () => parseJsonSafe,
   readJsonFile: () => readJsonFile,
   readTargetDefinitions: () => readTargetDefinitions,
   readTestSuiteMetadata: () => readTestSuiteMetadata,
@@ -76,6 +84,7 @@ __export(index_exports, {
   resolveTargetDefinition: () => resolveTargetDefinition,
   runEvalCase: () => runEvalCase,
   runEvaluation: () => runEvaluation,
+  scoreToVerdict: () => scoreToVerdict,
   subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
   subscribeToPiLogEntries: () => subscribeToPiLogEntries,
@@ -221,85 +230,6 @@ var import_promises6 = require("fs/promises");
 var import_node_path6 = __toESM(require("path"), 1);
 var import_yaml2 = require("yaml");
-// src/evaluation/formatting/segment-formatter.ts
-function extractCodeBlocks(segments) {
-  const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
-  const codeBlocks = [];
-  for (const segment of segments) {
-    const typeValue = segment.type;
-    if (typeof typeValue !== "string" || typeValue !== "text") {
-      continue;
-    }
-    const textValue = segment.value;
-    if (typeof textValue !== "string") {
-      continue;
-    }
-    const matches = textValue.match(CODE_BLOCK_PATTERN);
-    if (matches) {
-      codeBlocks.push(...matches);
-    }
-  }
-  return codeBlocks;
-}
-function formatFileContents(parts) {
-  const fileCount = parts.filter((p) => p.isFile).length;
-  if (fileCount > 0) {
-    return parts.map((part) => {
-      if (part.isFile && part.displayPath) {
-        return `<file path="${part.displayPath}">
-${part.content}
-</file>`;
-      }
-      return part.content;
-    }).join("\n\n");
-  }
-  return parts.map((p) => p.content).join(" ");
-}
-function formatSegment(segment, mode = "lm") {
-  const type = asString(segment.type);
-  if (type === "text") {
-    return asString(segment.value);
-  }
-  if (type === "guideline_ref") {
-    const refPath = asString(segment.path);
-    return refPath ? `<Attached: ${refPath}>` : void 0;
-  }
-  if (type === "file") {
-    const filePath = asString(segment.path);
-    if (!filePath) {
-      return void 0;
-    }
-    if (mode === "agent") {
-      return `<file: path="${filePath}">`;
-    }
-    const text = asString(segment.text);
-    if (text && filePath) {
-      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
-    }
-  }
-  return void 0;
-}
-function hasVisibleContent(segments) {
-  return segments.some((segment) => {
-    const type = asString(segment.type);
-    if (type === "text") {
-      const value = asString(segment.value);
-      return value !== void 0 && value.trim().length > 0;
-    }
-    if (type === "guideline_ref") {
-      return false;
-    }
-    if (type === "file") {
-      const text = asString(segment.text);
-      return text !== void 0 && text.trim().length > 0;
-    }
-    return false;
-  });
-}
-function asString(value) {
-  return typeof value === "string" ? value : void 0;
-}
 // src/evaluation/loaders/config-loader.ts
 var import_promises2 = require("fs/promises");
 var import_node_path2 = __toESM(require("path"), 1);
@@ -554,7 +484,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
       continue;
     }
-    const name = asString2(rawEvaluator.name);
+    const name = asString(rawEvaluator.name);
     const typeValue = rawEvaluator.type;
     if (!name || !isEvaluatorKind(typeValue)) {
       logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -582,7 +512,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const cwd = asString2(rawEvaluator.cwd);
+      const cwd = asString(rawEvaluator.cwd);
       let resolvedCwd;
       if (cwd) {
         const resolved = await resolveFileReference(cwd, searchRoots);
@@ -597,7 +527,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       } else {
         resolvedCwd = searchRoots[0];
       }
-      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
+      const rawTarget = rawEvaluator.target;
+      let targetConfig;
+      if (rawTarget !== void 0) {
+        if (isJsonObject2(rawTarget)) {
+          const maxCalls = rawTarget.max_calls;
+          if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
+            logWarning2(
+              `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
+            );
+          } else {
+            targetConfig = {
+              ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
+            };
+          }
+        } else if (rawTarget === true) {
+          targetConfig = {};
+        } else {
+          logWarning2(
+            `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
+          );
+        }
+      }
+      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
       const config = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
         if (!knownProps.has(key) && value !== void 0) {
@@ -611,7 +563,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         cwd,
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
-        ...Object.keys(config).length > 0 ? { config } : {}
+        ...Object.keys(config).length > 0 ? { config } : {},
+        ...targetConfig !== void 0 ? { target: targetConfig } : {}
       });
       continue;
     }
@@ -628,7 +581,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
         continue;
       }
-      const aggregatorType = asString2(rawAggregator.type);
+      const aggregatorType = asString(rawAggregator.type);
       if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
         logWarning2(
           `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -641,7 +594,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
           continue;
         }
-        const memberName = asString2(rawMember.name);
+        const memberName = asString(rawMember.name);
         const memberType = rawMember.type;
         if (!memberName || !isEvaluatorKind(memberType)) {
           logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -679,7 +632,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
         };
       } else if (aggregatorType === "code_judge") {
-        const aggregatorPath = asString2(rawAggregator.path);
+        const aggregatorPath = asString(rawAggregator.path);
         if (!aggregatorPath) {
           logWarning2(
             `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -692,7 +645,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           cwd: searchRoots[0]
         };
       } else {
-        const aggregatorPrompt = asString2(rawAggregator.prompt);
+        const aggregatorPrompt = asString(rawAggregator.prompt);
         let promptPath2;
         if (aggregatorPrompt) {
           const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
@@ -717,7 +670,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     if (typeValue === "tool_trajectory") {
-      const mode = asString2(rawEvaluator.mode);
+      const mode = asString(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
         logWarning2(
           `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -808,8 +761,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           );
           continue;
         }
-        const fieldPath = asString2(rawField.path);
-        const match = asString2(rawField.match);
+        const fieldPath = asString(rawField.path);
+        const match = asString(rawField.match);
         if (!fieldPath) {
           logWarning2(
             `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -839,7 +792,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
         continue;
       }
-      const aggregation = asString2(rawEvaluator.aggregation);
+      const aggregation = asString(rawEvaluator.aggregation);
       const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
@@ -920,7 +873,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    const prompt = asString2(rawEvaluator.prompt);
+    const prompt = asString(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
       const resolved = await resolveFileReference(prompt, searchRoots);
@@ -939,11 +892,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
       }
     }
-    const _model = asString2(rawEvaluator.model);
+    const _model = asString(rawEvaluator.model);
     const rawRubrics = rawEvaluator.rubrics;
     const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-      id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-      description: asString2(rubric.description) ?? "",
+      id: asString(rubric.id) ?? `rubric-${index + 1}`,
+      description: asString(rubric.description) ?? "",
       weight: typeof rubric.weight === "number" ? rubric.weight : 1,
       required: typeof rubric.required === "boolean" ? rubric.required : true
     })).filter((r) => r.description.length > 0) : void 0;
@@ -987,7 +940,7 @@ function coerceEvaluator(candidate, contextId) {
   logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
   return void 0;
 }
-function asString2(value) {
+function asString(value) {
   return typeof value === "string" ? value : void 0;
 }
 function asStringArray(value, description) {
@@ -1063,6 +1016,68 @@ function isValidFieldAggregationType(value) {
 // src/evaluation/loaders/message-processor.ts
 var import_promises4 = require("fs/promises");
 var import_node_path4 = __toESM(require("path"), 1);
+// src/evaluation/formatting/segment-formatter.ts
+function formatFileContents(parts) {
+  const fileCount = parts.filter((p) => p.isFile).length;
+  if (fileCount > 0) {
+    return parts.map((part) => {
+      if (part.isFile && part.displayPath) {
+        return `<file path="${part.displayPath}">
+${part.content}
+</file>`;
+      }
+      return part.content;
+    }).join("\n\n");
+  }
+  return parts.map((p) => p.content).join(" ");
+}
+function formatSegment(segment, mode = "lm") {
+  const type = asString2(segment.type);
+  if (type === "text") {
+    return asString2(segment.value);
+  }
+  if (type === "guideline_ref") {
+    const refPath = asString2(segment.path);
+    return refPath ? `<Attached: ${refPath}>` : void 0;
+  }
+  if (type === "file") {
+    const filePath = asString2(segment.path);
+    if (!filePath) {
+      return void 0;
+    }
+    if (mode === "agent") {
+      return `<file: path="${filePath}">`;
+    }
+    const text = asString2(segment.text);
+    if (text && filePath) {
+      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
+    }
+  }
+  return void 0;
+}
+function hasVisibleContent(segments) {
+  return segments.some((segment) => {
+    const type = asString2(segment.type);
+    if (type === "text") {
+      const value = asString2(segment.value);
+      return value !== void 0 && value.trim().length > 0;
+    }
+    if (type === "guideline_ref") {
+      return false;
+    }
+    if (type === "file") {
+      const text = asString2(segment.text);
+      return text !== void 0 && text.trim().length > 0;
+    }
+    return false;
+  });
+}
+function asString2(value) {
+  return typeof value === "string" ? value : void 0;
+}
+// src/evaluation/loaders/message-processor.ts
 var ANSI_YELLOW4 = "\x1B[33m";
 var ANSI_RESET4 = "\x1B[0m";
 async function processMessages(options) {
@@ -1368,9 +1383,6 @@ ${messageContent}`);
         questionParts.push(formattedContent);
       }
     }
-    if (testCase.code_snippets.length > 0) {
-      questionParts.push(testCase.code_snippets.join("\n"));
-    }
     question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
   }
   const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1569,7 +1581,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       repoRootPath,
       verbose
     }) : [];
-    const codeSnippets = extractCodeBlocks(inputSegments);
     let referenceAnswer = "";
     if (outputSegments.length > 0) {
       const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1642,7 +1653,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
       file_paths: allFilePaths,
-      code_snippets: codeSnippets,
       expected_outcome: outcome,
       evaluator: evalCaseEvaluatorKind,
       evaluators
@@ -6327,9 +6337,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
-// src/evaluation/evaluators.ts
-var import_ai2 = require("ai");
-var import_zod3 = require("zod");
+// src/evaluation/evaluators/scoring.ts
+function scoreToVerdict(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
+function clampScore(value) {
+  if (Number.isNaN(value) || !Number.isFinite(value)) {
+    return 0;
+  }
+  if (value < 0) {
+    return 0;
+  }
+  if (value > 1) {
+    return 1;
+  }
+  return value;
+}
+function extractJsonBlob(text) {
+  const match = text.match(/\{[\s\S]*\}/);
+  return match?.[0];
+}
+function parseJsonFromText(text) {
+  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+  const blob = extractJsonBlob(cleaned) ?? cleaned;
+  return JSON.parse(blob);
+}
+function isNonEmptyString(value) {
+  return typeof value === "string" && value.trim().length > 0;
+}
+function parseJsonSafe(payload) {
+  try {
+    return JSON.parse(payload);
+  } catch {
+    return void 0;
+  }
+}
+function deepEqual(a, b) {
+  if (a === b) return true;
+  if (a === null || b === null) return a === b;
+  if (typeof a !== typeof b) return false;
+  if (typeof a !== "object") return a === b;
+  if (Array.isArray(a) !== Array.isArray(b)) return false;
+  if (Array.isArray(a) && Array.isArray(b)) {
+    if (a.length !== b.length) return false;
+    return a.every((val, i) => deepEqual(val, b[i]));
+  }
+  const aObj = a;
+  const bObj = b;
+  const aKeys = Object.keys(aObj);
+  const bKeys = Object.keys(bObj);
+  if (aKeys.length !== bKeys.length) return false;
+  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
+}
 // src/runtime/exec.ts
 function shellEscapePath(value) {
@@ -6354,7 +6419,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
     cwd: options.cwd,
     stdin: encoder.encode(stdinPayload),
     stdout: "pipe",
-    stderr: "pipe"
+    stderr: "pipe",
+    // Merge additional env vars with process.env
+    env: options.env ? { ...process.env, ...options.env } : process.env
   });
   let timedOut = false;
   const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -6389,7 +6456,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
     const [cmd, ...args] = argv;
     const child = spawn4(cmd, args, {
       cwd: options.cwd,
-      stdio: ["pipe", "pipe", "pipe"]
+      stdio: ["pipe", "pipe", "pipe"],
+      // Merge additional env vars with process.env
+      env: options.env ? { ...process.env, ...options.env } : process.env
     });
     const stdoutChunks = [];
     const stderrChunks = [];
@@ -6442,7 +6511,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
       const child = spawn4(wrappedCommand, {
         shell: true,
         cwd: options.cwd,
-        stdio: ["ignore", "ignore", "ignore"]
+        stdio: ["ignore", "ignore", "ignore"],
+        // Merge additional env vars with process.env
+        env: options.env ? { ...process.env, ...options.env } : process.env
       });
       const timeout = options.timeoutMs ? setTimeout(() => {
         child.kill();
@@ -6469,59 +6540,414 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
   }
 }
-// src/evaluation/case-conversion.ts
-function toSnakeCase(str) {
-  if (/^[A-Z]/.test(str)) {
-    return str;
-  }
-  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
-}
-function toSnakeCaseDeep(obj) {
-  if (obj === null || obj === void 0) {
-    return obj;
-  }
-  if (Array.isArray(obj)) {
-    return obj.map((item) => toSnakeCaseDeep(item));
-  }
-  if (typeof obj === "object") {
-    const result = {};
-    for (const [key, value] of Object.entries(obj)) {
-      const snakeKey = toSnakeCase(key);
-      result[snakeKey] = toSnakeCaseDeep(value);
+// src/runtime/target-proxy.ts
+var import_node_crypto4 = require("crypto");
+var import_node_http = require("http");
+var DEFAULT_MAX_CALLS = 50;
+async function createTargetProxy(options) {
+  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
+  const token = (0, import_node_crypto4.randomBytes)(32).toString("hex");
+  let callCount = 0;
+  let isShutdown = false;
+  const targetsList = availableTargets ?? [defaultProvider.targetName];
+  function resolveProvider(targetName) {
+    if (targetName === void 0 || targetName === defaultProvider.targetName) {
+      return defaultProvider;
+    }
+    if (targetResolver) {
+      return targetResolver(targetName);
     }
-    return result;
+    return void 0;
   }
-  return obj;
-}
-// src/evaluation/providers/types.ts
-var AGENT_PROVIDER_KINDS = [
-  "codex",
-  "pi-coding-agent",
-  "claude-code",
-  "vscode",
-  "vscode-insiders"
-];
-function extractLastAssistantContent(messages) {
-  if (!messages || messages.length === 0) {
-    return "";
+  const server = (0, import_node_http.createServer)(async (req, res) => {
+    res.setHeader("Access-Control-Allow-Origin", "*");
+    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
+    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
+    if (req.method === "OPTIONS") {
+      res.writeHead(204);
+      res.end();
+      return;
+    }
+    const authHeader = req.headers.authorization;
+    if (!authHeader || authHeader !== `Bearer ${token}`) {
+      sendJson(res, 401, { error: "Unauthorized" });
+      return;
+    }
+    if (isShutdown) {
+      sendJson(res, 503, { error: "Proxy is shutting down" });
+      return;
+    }
+    const url2 = req.url ?? "";
+    if (req.method === "GET" && url2 === "/info") {
+      handleInfo(res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invoke") {
+      await handleInvoke(req, res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invokeBatch") {
+      await handleInvokeBatch(req, res);
+      return;
+    }
+    sendJson(res, 404, { error: "Not found" });
+  });
+  function handleInfo(res) {
+    const response = {
+      targetName: defaultProvider.targetName,
+      maxCalls,
+      callCount,
+      availableTargets: targetsList
+    };
+    sendJson(res, 200, response);
   }
-  for (let i = messages.length - 1; i >= 0; i--) {
-    const msg = messages[i];
-    if (msg.role === "assistant" && msg.content !== void 0) {
-      if (typeof msg.content === "string") {
-        return msg.content;
+  async function handleInvoke(req, res) {
+    if (callCount >= maxCalls) {
+      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
+      return;
+    }
+    try {
+      const body = await readBody(req);
+      const request = JSON.parse(body);
+      if (!request.question || typeof request.question !== "string") {
+        sendJson(res, 400, { error: "Missing required field: question" });
+        return;
       }
-      return JSON.stringify(msg.content);
+      const provider = resolveProvider(request.target);
+      if (!provider) {
+        sendJson(res, 400, {
+          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+        });
+        return;
+      }
+      callCount++;
+      const response = await provider.invoke({
+        question: request.question,
+        systemPrompt: request.systemPrompt,
+        evalCaseId: request.evalCaseId ?? "proxy",
+        attempt: request.attempt ?? 1
+      });
+      const outputMessages = response.outputMessages ?? [];
+      const rawText = extractLastAssistantContent(outputMessages);
+      const result = {
+        outputMessages,
+        rawText
+      };
+      sendJson(res, 200, result);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
     }
   }
-  return "";
-}
-function isAgentProvider(provider) {
-  return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
+  async function handleInvokeBatch(req, res) {
+    try {
+      const body = await readBody(req);
+      const { requests } = JSON.parse(body);
+      if (!Array.isArray(requests)) {
+        sendJson(res, 400, { error: "Missing required field: requests (array)" });
+        return;
+      }
+      if (callCount + requests.length > maxCalls) {
+        sendJson(res, 429, {
+          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
+        });
+        return;
+      }
+      const responses = [];
+      for (const request of requests) {
+        if (!request.question || typeof request.question !== "string") {
+          responses.push({
+            outputMessages: [],
+            rawText: "Error: Missing required field: question"
+          });
+          continue;
+        }
+        const provider = resolveProvider(request.target);
+        if (!provider) {
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+          });
+          continue;
+        }
+        callCount++;
+        try {
+          const response = await provider.invoke({
+            question: request.question,
+            systemPrompt: request.systemPrompt,
+            evalCaseId: request.evalCaseId ?? "proxy",
+            attempt: request.attempt ?? 1
+          });
+          const outputMessages = response.outputMessages ?? [];
+          responses.push({
+            outputMessages,
+            rawText: extractLastAssistantContent(outputMessages)
+          });
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: ${message}`
+          });
+        }
+      }
+      sendJson(res, 200, { responses });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
+    }
+  }
+  await new Promise((resolve, reject) => {
+    server.once("error", reject);
+    server.listen(0, "127.0.0.1", () => {
+      server.removeListener("error", reject);
+      resolve();
+    });
+  });
+  const address = server.address();
+  const url = `http://127.0.0.1:${address.port}`;
+  return {
+    url,
+    token,
+    shutdown: async () => {
+      isShutdown = true;
+      return new Promise((resolve, reject) => {
+        server.close((err) => {
+          if (err) reject(err);
+          else resolve();
+        });
+      });
+    },
+    getUsageMetadata: () => ({
+      callCount,
+      maxCalls
+    })
+  };
+}
+function sendJson(res, statusCode, body) {
+  res.writeHead(statusCode, { "Content-Type": "application/json" });
+  res.end(JSON.stringify(body));
+}
+function readBody(req) {
+  return new Promise((resolve, reject) => {
+    const chunks = [];
+    req.on("data", (chunk) => chunks.push(chunk));
+    req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
+    req.on("error", reject);
+  });
+}
+function extractLastAssistantContent(messages) {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === "assistant" && msg.content !== void 0) {
+      if (typeof msg.content === "string") {
+        return msg.content;
+      }
+      if (Array.isArray(msg.content)) {
+        for (const part of msg.content) {
+          if (typeof part === "object" && part !== null && "text" in part) {
+            return String(part.text);
+          }
+        }
+      }
+    }
+  }
+  return void 0;
+}
+// src/evaluation/case-conversion.ts
+function toSnakeCase(str) {
+  if (/^[A-Z]/.test(str)) {
+    return str;
+  }
+  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
+}
+function toSnakeCaseDeep(obj) {
+  if (obj === null || obj === void 0) {
+    return obj;
+  }
+  if (Array.isArray(obj)) {
+    return obj.map((item) => toSnakeCaseDeep(item));
+  }
+  if (typeof obj === "object") {
+    const result = {};
+    for (const [key, value] of Object.entries(obj)) {
+      const snakeKey = toSnakeCase(key);
+      result[snakeKey] = toSnakeCaseDeep(value);
+    }
+    return result;
+  }
+  return obj;
+}
+// src/evaluation/evaluators/code-evaluator.ts
+var CodeEvaluator = class {
+  kind = "code";
+  script;
+  cwd;
+  agentTimeoutMs;
+  config;
+  target;
+  constructor(options) {
+    this.script = options.script;
+    this.cwd = options.cwd;
+    this.agentTimeoutMs = options.agentTimeoutMs;
+    this.config = options.config;
+    this.target = options.target;
+  }
+  async evaluate(context) {
+    const payload = {
+      question: context.evalCase.question,
+      expectedOutcome: context.evalCase.expected_outcome,
+      expectedMessages: context.evalCase.expected_messages,
+      referenceAnswer: context.evalCase.reference_answer,
+      candidateAnswer: context.candidate,
+      outputMessages: context.outputMessages ?? null,
+      guidelineFiles: context.evalCase.guideline_paths,
+      inputFiles: context.evalCase.file_paths.filter(
+        (path17) => !context.evalCase.guideline_paths.includes(path17)
+      ),
+      inputMessages: context.evalCase.input_messages,
+      traceSummary: context.traceSummary ?? null,
+      config: this.config ?? null
+    };
+    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+    let proxyEnv;
+    let proxyShutdown;
+    let getProxyUsage;
+    if (this.target !== void 0 && context.judgeProvider) {
+      const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
+      const proxy = await createTargetProxy({
+        defaultProvider: context.judgeProvider,
+        targetResolver: context.targetResolver,
+        availableTargets: context.availableTargets,
+        maxCalls
+      });
+      proxyEnv = {
+        AGENTV_TARGET_PROXY_URL: proxy.url,
+        AGENTV_TARGET_PROXY_TOKEN: proxy.token
+      };
+      proxyShutdown = proxy.shutdown;
+      getProxyUsage = proxy.getUsageMetadata;
+    }
+    try {
+      const stdout = await executeScript(
+        this.script,
+        inputPayload,
+        this.agentTimeoutMs,
+        this.cwd,
+        proxyEnv
+      );
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
+      const proxyUsage = getProxyUsage?.();
+      const evaluatorRawRequest = {
+        script: this.script,
+        ...this.cwd ? { cwd: this.cwd } : {},
+        ...proxyUsage ? {
+          target_proxy: {
+            call_count: proxyUsage.callCount,
+            max_calls: proxyUsage.maxCalls
+          }
+        } : {}
+      };
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest,
+        ...details ? { details } : {}
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      const proxyUsage = getProxyUsage?.();
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code evaluator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          script: this.script,
+          ...this.cwd ? { cwd: this.cwd } : {},
+          ...proxyUsage ? {
+            target_proxy: {
+              call_count: proxyUsage.callCount,
+              max_calls: proxyUsage.maxCalls
+            }
+          } : {},
+          error: message
+        }
+      };
+    } finally {
+      if (proxyShutdown) {
+        await proxyShutdown();
+      }
+    }
+  }
+};
+async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
+  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+  if (exitCode !== 0) {
+    const trimmedErr = formatStderr(stderr);
+    throw new Error(
+      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+    );
+  }
+  return stdout.trim();
+}
+function formatStderr(stderr) {
+  const trimmed = stderr.trim();
+  const maxLength = 2e3;
+  if (trimmed.length <= maxLength) {
+    return trimmed;
+  }
+  const tail = trimmed.slice(-maxLength);
+  return `...(truncated, last ${maxLength} chars)
+${tail}`;
+}
+// src/evaluation/evaluators/composite.ts
+var import_ai3 = require("ai");
+// src/evaluation/providers/types.ts
+var AGENT_PROVIDER_KINDS = [
+  "codex",
+  "pi-coding-agent",
+  "claude-code",
+  "vscode",
+  "vscode-insiders"
+];
+function extractLastAssistantContent2(messages) {
+  if (!messages || messages.length === 0) {
+    return "";
+  }
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === "assistant" && msg.content !== void 0) {
+      if (typeof msg.content === "string") {
+        return msg.content;
+      }
+      return JSON.stringify(msg.content);
+    }
+  }
+  return "";
+}
+function isAgentProvider(provider) {
+  return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
 }
-// src/evaluation/evaluators.ts
+// src/evaluation/evaluators/llm-judge.ts
+var import_ai2 = require("ai");
+var import_zod3 = require("zod");
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -6601,7 +7027,7 @@ var LlmJudgeEvaluator = class {
       target: judgeProvider.targetName
     };
     try {
-      const { data, providerResponse } = await this.runWithRetry({
+      const { data } = await this.runWithRetry({
         context,
         judgeProvider,
         systemPrompt,
@@ -6714,7 +7140,7 @@ var LlmJudgeEvaluator = class {
           temperature: this.temperature
         });
         const data = schema.parse(
-          parseJsonFromText(extractLastAssistantContent(response.outputMessages))
+          parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
         );
         return { data, providerResponse: response };
       } catch (e) {
@@ -6750,86 +7176,160 @@ You must return a valid JSON object matching this schema:
   "overall_reasoning": "string (summary)"
 }`;
 }
-function scoreToVerdict(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
+function substituteVariables(template, variables) {
+  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
+    return variables[varName] ?? match;
+  });
 }
-function clampScore(value) {
-  if (Number.isNaN(value) || !Number.isFinite(value)) {
-    return 0;
-  }
-  if (value < 0) {
-    return 0;
-  }
-  if (value > 1) {
-    return 1;
+function calculateRubricScore(result, rubrics) {
+  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+  const hits = [];
+  const misses = [];
+  let totalWeight = 0;
+  let earnedWeight = 0;
+  let failedRequired = false;
+  for (const check of result.checks) {
+    const rubric = rubricMap.get(check.id);
+    if (!rubric) {
+      continue;
+    }
+    totalWeight += rubric.weight;
+    if (check.satisfied) {
+      earnedWeight += rubric.weight;
+      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+    } else {
+      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      if (rubric.required) {
+        failedRequired = true;
+      }
+    }
   }
-  return value;
-}
-function extractJsonBlob(text) {
-  const match = text.match(/\{[\s\S]*\}/);
-  return match?.[0];
-}
-function parseJsonFromText(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const blob = extractJsonBlob(cleaned) ?? cleaned;
-  return JSON.parse(blob);
-}
-function isNonEmptyString(value) {
-  return typeof value === "string" && value.trim().length > 0;
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+  return { score, verdict, hits, misses };
 }
-var CodeEvaluator = class {
-  kind = "code";
-  script;
-  cwd;
-  agentTimeoutMs;
+// src/evaluation/evaluators/composite.ts
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
+{{EVALUATOR_RESULTS_JSON}}
+Decide the final score and verdict based on all evaluator results.
+Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
+var CompositeEvaluator = class {
+  kind = "composite";
   config;
+  evaluatorFactory;
+  cwd;
   constructor(options) {
-    this.script = options.script;
-    this.cwd = options.cwd;
-    this.agentTimeoutMs = options.agentTimeoutMs;
     this.config = options.config;
+    this.evaluatorFactory = options.evaluatorFactory;
+    this.cwd = options.cwd;
   }
   async evaluate(context) {
-    const payload = {
-      question: context.evalCase.question,
-      expectedOutcome: context.evalCase.expected_outcome,
-      expectedMessages: context.evalCase.expected_messages,
-      referenceAnswer: context.evalCase.reference_answer,
-      candidateAnswer: context.candidate,
-      outputMessages: context.outputMessages ?? null,
-      guidelineFiles: context.evalCase.guideline_paths,
-      inputFiles: context.evalCase.file_paths.filter(
-        (path17) => !context.evalCase.guideline_paths.includes(path17)
-      ),
-      inputMessages: context.evalCase.input_messages,
-      traceSummary: context.traceSummary ?? null,
-      config: this.config ?? null
+    const memberResults = await Promise.all(
+      this.config.evaluators.map(async (memberConfig) => {
+        const evaluator = this.evaluatorFactory.create(memberConfig, context);
+        return {
+          id: memberConfig.name,
+          type: memberConfig.type,
+          result: await evaluator.evaluate(context)
+        };
+      })
+    );
+    return this.aggregate(memberResults, context);
+  }
+  async aggregate(results, context) {
+    const aggregator = this.config.aggregator;
+    switch (aggregator.type) {
+      case "code_judge":
+        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
+      case "llm_judge":
+        return this.runLlmAggregator(results, context, aggregator);
+      default:
+        return this.runWeightedAverage(results, aggregator.weights);
+    }
+  }
+  runWeightedAverage(results, weights) {
+    let totalWeight = 0;
+    let weightedSum = 0;
+    const allHits = [];
+    const allMisses = [];
+    const reasoningParts = [];
+    const evaluatorResults = [];
+    for (const member of results) {
+      const weight = weights?.[member.id] ?? 1;
+      totalWeight += weight;
+      weightedSum += member.result.score * weight;
+      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
+      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
+      if (member.result.reasoning) {
+        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
+      }
+      evaluatorResults.push({
+        name: member.id,
+        type: member.type,
+        score: member.result.score,
+        weight,
+        verdict: member.result.verdict,
+        hits: [...member.result.hits],
+        misses: [...member.result.misses],
+        reasoning: member.result.reasoning,
+        evaluatorRawRequest: member.result.evaluatorRawRequest,
+        evaluatorResults: member.result.evaluatorResults,
+        details: member.result.details
+      });
+    }
+    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
+    return {
+      score: clampScore(finalScore),
+      verdict: scoreToVerdict(finalScore),
+      hits: allHits,
+      misses: allMisses,
+      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
+      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      evaluatorRawRequest: {
+        aggregator: "weighted_average",
+        ...weights ? { weights } : {}
+      },
+      evaluatorResults
     };
-    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+  }
+  async runCodeAggregator(results, scriptPath, cwd, weights) {
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weights?.[member.id] ?? 1,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
     try {
-      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
+      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
       const parsed = parseJsonSafe(stdout);
       const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
       const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
       const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
       const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
       return {
         score,
-        verdict: scoreToVerdict(score),
+        verdict,
         hits,
         misses,
         expectedAspectCount: hits.length + misses.length || 1,
         reasoning,
         evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {}
-        }
+          aggregator: "code_judge",
+          script: scriptPath
+        },
+        evaluatorResults
       };
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
@@ -6837,452 +7337,292 @@ var CodeEvaluator = class {
         score: 0,
         verdict: "fail",
         hits: [],
-        misses: [`Code evaluator failed: ${message}`],
+        misses: [`Code aggregator failed: ${message}`],
         expectedAspectCount: 1,
         reasoning: message,
         evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {},
+          aggregator: "code_judge",
+          script: scriptPath,
           error: message
-        }
+        },
+        evaluatorResults
       };
     }
   }
-};
-function calculateRubricScore(result, rubrics) {
-  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
-  const hits = [];
-  const misses = [];
-  let totalWeight = 0;
-  let earnedWeight = 0;
-  let failedRequired = false;
-  for (const check of result.checks) {
-    const rubric = rubricMap.get(check.id);
-    if (!rubric) {
-      continue;
+  async runLlmAggregator(results, context, config) {
+    const judgeProvider = context.judgeProvider;
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for LLM aggregation");
     }
-    totalWeight += rubric.weight;
-    if (check.satisfied) {
-      earnedWeight += rubric.weight;
-      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-    } else {
-      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-      if (rubric.required) {
-        failedRequired = true;
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const resultsJson = JSON.stringify(resultsObject, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
+    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
+    const systemPrompt = buildOutputSchema();
+    const evaluatorRawRequest = {
+      aggregator: "llm_judge",
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    try {
+      const model = judgeProvider.asLanguageModel?.();
+      if (model) {
+        const { text } = await (0, import_ai3.generateText)({
+          model,
+          system: systemPrompt,
+          prompt: userPrompt
+        });
+        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
+        const score2 = clampScore(data2.score);
+        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
+        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
+        const reasoning2 = data2.reasoning;
+        return {
+          score: score2,
+          verdict: scoreToVerdict(score2),
+          hits: hits2,
+          misses: misses2,
+          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
+          reasoning: reasoning2,
+          evaluatorRawRequest,
+          evaluatorResults
+        };
       }
+      const response = await judgeProvider.invoke({
+        question: userPrompt,
+        systemPrompt,
+        evalCaseId: context.evalCase.id,
+        attempt: context.attempt
+      });
+      const data = freeformEvaluationSchema.parse(
+        parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
+      );
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning;
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: Math.max(hits.length + misses.length, 1),
+        reasoning,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    } catch {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [],
+        expectedAspectCount: 1,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
     }
   }
-  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
-  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
-  return { score, verdict, hits, misses };
-}
-async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
-  if (exitCode !== 0) {
-    const trimmedErr = formatStderr(stderr);
-    throw new Error(
-      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
-    );
-  }
-  return stdout.trim();
-}
-function formatStderr(stderr) {
-  const trimmed = stderr.trim();
-  const maxLength = 2e3;
-  if (trimmed.length <= maxLength) {
-    return trimmed;
+};
+// src/evaluation/evaluators/cost.ts
+var CostEvaluator = class {
+  kind = "cost";
+  config;
+  constructor(options) {
+    this.config = options.config;
   }
-  const tail = trimmed.slice(-maxLength);
-  return `...(truncated, last ${maxLength} chars)
-${tail}`;
-}
-function parseJsonSafe(payload) {
-  try {
-    return JSON.parse(payload);
-  } catch {
-    return void 0;
-  }
-}
-function substituteVariables(template, variables) {
-  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
-    return variables[varName] ?? match;
-  });
-}
-function deepEqual(a, b) {
-  if (a === b) return true;
-  if (a === null || b === null) return a === b;
-  if (typeof a !== typeof b) return false;
-  if (typeof a !== "object") return a === b;
-  if (Array.isArray(a) !== Array.isArray(b)) return false;
-  if (Array.isArray(a) && Array.isArray(b)) {
-    if (a.length !== b.length) return false;
-    return a.every((val, i) => deepEqual(val, b[i]));
-  }
-  const aObj = a;
-  const bObj = b;
-  const aKeys = Object.keys(aObj);
-  const bKeys = Object.keys(bObj);
-  if (aKeys.length !== bKeys.length) return false;
-  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
-}
-function argsMatch(expected, actual) {
-  if (expected === void 0) return true;
-  if (expected === "any") return true;
-  if (actual === void 0) return false;
-  for (const key of Object.keys(expected)) {
-    if (!Object.hasOwn(actual, key)) return false;
-    if (!deepEqual(expected[key], actual[key])) return false;
+  evaluate(context) {
+    const { budget } = this.config;
+    const costUsd = context.traceSummary?.costUsd;
+    if (costUsd === void 0) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No cost data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution cost not reported by provider",
+        evaluatorRawRequest: {
+          type: "cost",
+          budget,
+          costUsd: null
+        }
+      };
+    }
+    const passed = costUsd <= budget;
+    const score = passed ? 1 : 0;
+    const formatCost = (n) => `$${n.toFixed(4)}`;
+    return {
+      score,
+      verdict: passed ? "pass" : "fail",
+      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
+      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
+      expectedAspectCount: 1,
+      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
+      evaluatorRawRequest: {
+        type: "cost",
+        budget,
+        costUsd
+      }
+    };
   }
-  return true;
-}
-var ToolTrajectoryEvaluator = class {
-  kind = "tool_trajectory";
+};
+// src/evaluation/evaluators/field-accuracy.ts
+var DEFAULT_DATE_FORMATS = [
+  "YYYY-MM-DDTHH:mm:ssZ",
+  // ISO with timezone
+  "YYYY-MM-DDTHH:mm:ss",
+  // ISO with time
+  "YYYY-MM-DD",
+  // ISO date
+  "DD-MMM-YYYY",
+  // Localized (e.g., "15-JAN-2025")
+  "MM/DD/YYYY",
+  // US format
+  "DD/MM/YYYY",
+  // EU format
+  "MM-DD-YYYY",
+  // US with dashes
+  "DD-MM-YYYY"
+  // EU with dashes
+];
+var MONTH_NAMES = {
+  jan: 0,
+  january: 0,
+  feb: 1,
+  february: 1,
+  mar: 2,
+  march: 2,
+  apr: 3,
+  april: 3,
+  may: 4,
+  jun: 5,
+  june: 5,
+  jul: 6,
+  july: 6,
+  aug: 7,
+  august: 7,
+  sep: 8,
+  sept: 8,
+  september: 8,
+  oct: 9,
+  october: 9,
+  nov: 10,
+  november: 10,
+  dec: 11,
+  december: 11
+};
+var FieldAccuracyEvaluator = class {
+  kind = "field_accuracy";
   config;
   constructor(options) {
     this.config = options.config;
   }
   evaluate(context) {
-    const { outputMessages, traceSummary } = context;
-    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
-    if (toolCalls.length === 0 && !traceSummary) {
+    const { evalCase, candidate } = context;
+    let candidateData;
+    try {
+      candidateData = parseJsonFromTextSafe(candidate);
+    } catch {
       return {
         score: 0,
         verdict: "fail",
         hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
+        misses: ["Failed to parse candidate answer as JSON"],
+        expectedAspectCount: this.config.fields.length,
+        reasoning: "Candidate answer is not valid JSON"
       };
     }
-    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
-    if (!summary) {
+    const expectedData = this.extractExpectedData(evalCase.expected_messages);
+    if (!expectedData) {
       return {
         score: 0,
         verdict: "fail",
         hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
+        misses: ["No expected data found in expected_messages"],
+        expectedAspectCount: this.config.fields.length,
+        reasoning: "Could not extract expected data from expected_messages"
       };
     }
-    switch (this.config.mode) {
-      case "any_order":
-        return this.evaluateAnyOrder(summary);
-      case "in_order":
-        return this.evaluateInOrder(toolCalls);
-      case "exact":
-        return this.evaluateExact(toolCalls);
-      default:
-        return {
-          score: 0,
-          verdict: "fail",
-          hits: [],
-          misses: [`Unknown mode: ${this.config.mode}`],
-          expectedAspectCount: 1
-        };
+    const fieldResults = [];
+    for (const fieldConfig of this.config.fields) {
+      const result = this.evaluateField(fieldConfig, candidateData, expectedData);
+      fieldResults.push(result);
     }
+    return this.aggregateResults(fieldResults);
   }
   /**
-   * Extract tool calls from output messages.
+   * Extract expected data from expected_messages array.
+   * Looks for the last assistant message with content.
    */
-  extractToolCallsFromMessages(messages) {
-    if (!messages) {
-      return [];
-    }
-    const toolCalls = [];
-    for (const message of messages) {
-      if (message.toolCalls) {
-        for (const call of message.toolCalls) {
-          toolCalls.push({
-            name: call.tool,
-            args: call.input
-          });
+  extractExpectedData(expectedMessages) {
+    for (let i = expectedMessages.length - 1; i >= 0; i--) {
+      const message = expectedMessages[i];
+      if (message.role === "assistant" && message.content) {
+        if (typeof message.content === "object" && message.content !== null) {
+          return message.content;
+        }
+        if (typeof message.content === "string") {
+          try {
+            return parseJsonFromTextSafe(message.content);
+          } catch {
+          }
         }
       }
     }
-    return toolCalls;
+    return void 0;
   }
   /**
-   * Build a summary from extracted tool calls.
+   * Evaluate a single field against the expected value.
    */
-  buildSummary(toolCalls) {
-    const toolCallsByName = {};
-    for (const call of toolCalls) {
-      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
-    }
-    const toolNames = Object.keys(toolCallsByName).sort();
-    return {
-      eventCount: toolCalls.length,
-      toolNames,
-      toolCallsByName,
-      errorCount: 0
-    };
-  }
-  evaluateAnyOrder(summary) {
-    const minimums = this.config.minimums ?? {};
-    const toolNames = Object.keys(minimums);
-    if (toolNames.length === 0) {
+  evaluateField(fieldConfig, candidateData, expectedData) {
+    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
+    const candidateValue = resolvePath(candidateData, path17);
+    const expectedValue = resolvePath(expectedData, path17);
+    if (expectedValue === void 0) {
       return {
+        path: path17,
         score: 1,
-        verdict: "pass",
-        hits: ["No tool requirements specified"],
-        misses: [],
-        expectedAspectCount: 0
+        // No expected value means no comparison needed
+        weight,
+        hit: true,
+        message: `${path17}: no expected value`
       };
     }
-    const hits = [];
-    const misses = [];
-    for (const toolName of toolNames) {
-      const required = minimums[toolName];
-      const actual = summary.toolCallsByName[toolName] ?? 0;
-      if (actual >= required) {
-        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
-      } else {
-        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+    if (candidateValue === void 0) {
+      if (required) {
+        return {
+          path: path17,
+          score: 0,
+          weight,
+          hit: false,
+          message: `${path17} (required, missing)`
+        };
       }
-    }
-    const score = hits.length / toolNames.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: toolNames.length
-    };
-  }
-  evaluateInOrder(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
-        expectedAspectCount: 0
-      };
-    }
-    const hits = [];
-    const misses = [];
-    let actualIndex = 0;
-    for (let i = 0; i < expected.length; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      let found = false;
-      let argsMismatch = false;
-      while (actualIndex < toolCalls.length) {
-        const actualCall = toolCalls[actualIndex];
-        if (actualCall.name === expectedTool) {
-          if (argsMatch(expectedItem.args, actualCall.args)) {
-            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
-            actualIndex++;
-            found = true;
-            break;
-          }
-          misses.push(
-            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
-          );
-          actualIndex++;
-          argsMismatch = true;
-          break;
-        }
-        actualIndex++;
-      }
-      if (!found && !argsMismatch) {
-        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
-      }
-    }
-    const score = hits.length / expected.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
-    };
-  }
-  evaluateExact(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
-        expectedAspectCount: 0
-      };
-    }
-    const hits = [];
-    const misses = [];
-    if (toolCalls.length !== expected.length) {
-      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
-    }
-    const checkLength = Math.min(expected.length, toolCalls.length);
-    for (let i = 0; i < checkLength; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      const actualCall = toolCalls[i];
-      const actualTool = actualCall.name;
-      if (actualTool === expectedTool) {
-        if (argsMatch(expectedItem.args, actualCall.args)) {
-          hits.push(`Position ${i}: ${expectedTool}`);
-        } else {
-          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
-        }
-      } else {
-        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
-      }
-    }
-    for (let i = checkLength; i < expected.length; i++) {
-      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
-    }
-    const score = hits.length / expected.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
-    };
-  }
-};
-var DEFAULT_DATE_FORMATS = [
-  "YYYY-MM-DDTHH:mm:ssZ",
-  // ISO with timezone
-  "YYYY-MM-DDTHH:mm:ss",
-  // ISO with time
-  "YYYY-MM-DD",
-  // ISO date
-  "DD-MMM-YYYY",
-  // Localized (e.g., "15-JAN-2025")
-  "MM/DD/YYYY",
-  // US format
-  "DD/MM/YYYY",
-  // EU format
-  "MM-DD-YYYY",
-  // US with dashes
-  "DD-MM-YYYY"
-  // EU with dashes
-];
-var MONTH_NAMES = {
-  jan: 0,
-  january: 0,
-  feb: 1,
-  february: 1,
-  mar: 2,
-  march: 2,
-  apr: 3,
-  april: 3,
-  may: 4,
-  jun: 5,
-  june: 5,
-  jul: 6,
-  july: 6,
-  aug: 7,
-  august: 7,
-  sep: 8,
-  sept: 8,
-  september: 8,
-  oct: 9,
-  october: 9,
-  nov: 10,
-  november: 10,
-  dec: 11,
-  december: 11
-};
-var FieldAccuracyEvaluator = class {
-  kind = "field_accuracy";
-  config;
-  constructor(options) {
-    this.config = options.config;
-  }
-  evaluate(context) {
-    const { evalCase, candidate } = context;
-    let candidateData;
-    try {
-      candidateData = parseJsonFromTextSafe(candidate);
-    } catch {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["Failed to parse candidate answer as JSON"],
-        expectedAspectCount: this.config.fields.length,
-        reasoning: "Candidate answer is not valid JSON"
-      };
-    }
-    const expectedData = this.extractExpectedData(evalCase.expected_messages);
-    if (!expectedData) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No expected data found in expected_messages"],
-        expectedAspectCount: this.config.fields.length,
-        reasoning: "Could not extract expected data from expected_messages"
-      };
-    }
-    const fieldResults = [];
-    for (const fieldConfig of this.config.fields) {
-      const result = this.evaluateField(fieldConfig, candidateData, expectedData);
-      fieldResults.push(result);
-    }
-    return this.aggregateResults(fieldResults);
-  }
-  /**
-   * Extract expected data from expected_messages array.
-   * Looks for the last assistant message with content.
-   */
-  extractExpectedData(expectedMessages) {
-    for (let i = expectedMessages.length - 1; i >= 0; i--) {
-      const message = expectedMessages[i];
-      if (message.role === "assistant" && message.content) {
-        if (typeof message.content === "object" && message.content !== null) {
-          return message.content;
-        }
-        if (typeof message.content === "string") {
-          try {
-            return parseJsonFromTextSafe(message.content);
-          } catch {
-          }
-        }
-      }
-    }
-    return void 0;
-  }
-  /**
-   * Evaluate a single field against the expected value.
-   */
-  evaluateField(fieldConfig, candidateData, expectedData) {
-    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
-    const candidateValue = resolvePath(candidateData, path17);
-    const expectedValue = resolvePath(expectedData, path17);
-    if (expectedValue === void 0) {
-      return {
-        path: path17,
-        score: 1,
-        // No expected value means no comparison needed
-        weight,
-        hit: true,
-        message: `${path17}: no expected value`
-      };
-    }
-    if (candidateValue === void 0) {
-      if (required) {
-        return {
-          path: path17,
-          score: 0,
-          weight,
-          hit: false,
-          message: `${path17} (required, missing)`
-        };
-      }
-      return {
-        path: path17,
-        score: 1,
-        // Don't penalize missing optional fields
-        weight: 0,
-        // Zero weight means it won't affect the score
-        hit: true,
-        message: `${path17}: optional field missing`
-      };
+      return {
+        path: path17,
+        score: 1,
+        // Don't penalize missing optional fields
+        weight: 0,
+        // Zero weight means it won't affect the score
+        hit: true,
+        message: `${path17}: optional field missing`
+      };
     }
     switch (match) {
       case "exact":
@@ -7353,436 +7693,211 @@ var FieldAccuracyEvaluator = class {
         message: `${path17} (non-numeric value)`
       };
     }
-    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (invalid numeric value)`
-      };
-    }
-    const diff = Math.abs(candidateNum - expectedNum);
-    let withinTolerance;
-    if (relative) {
-      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
-      withinTolerance = relativeDiff <= tolerance;
-    } else {
-      withinTolerance = diff <= tolerance;
-    }
-    if (withinTolerance) {
-      return {
-        path: path17,
-        score: 1,
-        weight,
-        hit: true,
-        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
-      };
-    }
-    return {
-      path: path17,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
-    };
-  }
-  /**
-   * Date comparison with format normalization.
-   */
-  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
-    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
-    const candidateDate = parseDate(String(candidateValue), formats);
-    const expectedDate = parseDate(String(expectedValue), formats);
-    if (candidateDate === null) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (unparseable candidate date)`
-      };
-    }
-    if (expectedDate === null) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (unparseable expected date)`
-      };
-    }
-    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
-      return {
-        path: path17,
-        score: 1,
-        weight,
-        hit: true,
-        message: path17
-      };
-    }
-    return {
-      path: path17,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
-    };
-  }
-  /**
-   * Aggregate field results using configured strategy.
-   */
-  aggregateResults(results) {
-    const aggregation = this.config.aggregation ?? "weighted_average";
-    const hits = [];
-    const misses = [];
-    for (const result of results) {
-      if (result.hit) {
-        hits.push(result.message);
-      } else {
-        misses.push(result.message);
-      }
-    }
-    let score;
-    if (aggregation === "all_or_nothing") {
-      score = misses.length === 0 ? 1 : 0;
-    } else {
-      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
-      if (totalWeight === 0) {
-        score = results.length === 0 ? 1 : 0;
-      } else {
-        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
-        score = weightedSum / totalWeight;
-      }
-    }
-    const reasoning = `${hits.length}/${results.length} fields matched`;
-    return {
-      score: clampScore(score),
-      verdict: scoreToVerdict(score),
-      hits: hits.slice(0, 4),
-      misses: misses.slice(0, 4),
-      expectedAspectCount: results.length,
-      reasoning
-    };
-  }
-};
-function resolvePath(obj, path17) {
-  if (!path17 || !obj) {
-    return void 0;
-  }
-  const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
-  let current = obj;
-  for (const part of parts) {
-    if (current === null || current === void 0) {
-      return void 0;
-    }
-    if (typeof current !== "object") {
-      return void 0;
-    }
-    const isIndex = /^\d+$/.test(part);
-    if (isIndex && Array.isArray(current)) {
-      current = current[Number.parseInt(part, 10)];
-    } else {
-      current = current[part];
-    }
-  }
-  return current;
-}
-function toNumber(value) {
-  if (typeof value === "number") {
-    return value;
-  }
-  if (typeof value === "string") {
-    const num = Number.parseFloat(value);
-    return Number.isNaN(num) ? null : num;
-  }
-  return null;
-}
-function parseDate(dateStr, formats) {
-  if (!dateStr) return null;
-  const trimmed = dateStr.trim();
-  const isoDate = new Date(trimmed);
-  if (!Number.isNaN(isoDate.getTime())) {
-    return isoDate;
-  }
-  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
-  if (localizedMatch) {
-    const day = Number.parseInt(localizedMatch[1], 10);
-    const monthName = localizedMatch[2].toLowerCase();
-    const year = Number.parseInt(localizedMatch[3], 10);
-    const month = MONTH_NAMES[monthName];
-    if (month !== void 0) {
-      return new Date(year, month, day);
-    }
-  }
-  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
-  if (usMatch) {
-    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
-    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
-    if (hasUSFormat && !hasEUFormat) {
-      const month = Number.parseInt(usMatch[1], 10) - 1;
-      const day = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else if (hasEUFormat && !hasUSFormat) {
-      const day = Number.parseInt(usMatch[1], 10);
-      const month = Number.parseInt(usMatch[2], 10) - 1;
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else {
-      const num1 = Number.parseInt(usMatch[1], 10);
-      const num2 = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (num1 > 12 && num2 <= 12) {
-        return new Date(year, num2 - 1, num1);
-      }
-      if (num2 > 12 && num1 <= 12) {
-        return new Date(year, num1 - 1, num2);
-      }
-      if (num1 <= 12 && num2 <= 31) {
-        return new Date(year, num1 - 1, num2);
-      }
-    }
-  }
-  return null;
-}
-function formatDateISO(date) {
-  return date.toISOString().split("T")[0];
-}
-function parseJsonFromTextSafe(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const match = cleaned.match(/\{[\s\S]*\}/);
-  const blob = match?.[0] ?? cleaned;
-  return JSON.parse(blob);
-}
-var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
-{{EVALUATOR_RESULTS_JSON}}
-Decide the final score and verdict based on all evaluator results.
-Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
-var CompositeEvaluator = class {
-  kind = "composite";
-  config;
-  evaluatorFactory;
-  cwd;
-  constructor(options) {
-    this.config = options.config;
-    this.evaluatorFactory = options.evaluatorFactory;
-    this.cwd = options.cwd;
-  }
-  async evaluate(context) {
-    const memberResults = await Promise.all(
-      this.config.evaluators.map(async (memberConfig) => {
-        const evaluator = this.evaluatorFactory.create(memberConfig, context);
-        return {
-          id: memberConfig.name,
-          type: memberConfig.type,
-          result: await evaluator.evaluate(context)
-        };
-      })
-    );
-    return this.aggregate(memberResults, context);
-  }
-  async aggregate(results, context) {
-    const aggregator = this.config.aggregator;
-    switch (aggregator.type) {
-      case "code_judge":
-        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
-      case "llm_judge":
-        return this.runLlmAggregator(results, context, aggregator);
-      default:
-        return this.runWeightedAverage(results, aggregator.weights);
-    }
-  }
-  runWeightedAverage(results, weights) {
-    let totalWeight = 0;
-    let weightedSum = 0;
-    const allHits = [];
-    const allMisses = [];
-    const reasoningParts = [];
-    const evaluatorResults = [];
-    for (const member of results) {
-      const weight = weights?.[member.id] ?? 1;
-      totalWeight += weight;
-      weightedSum += member.result.score * weight;
-      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
-      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
-      if (member.result.reasoning) {
-        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
-      }
-      evaluatorResults.push({
-        name: member.id,
-        type: member.type,
-        score: member.result.score,
-        weight,
-        verdict: member.result.verdict,
-        hits: [...member.result.hits],
-        misses: [...member.result.misses],
-        reasoning: member.result.reasoning,
-        evaluatorRawRequest: member.result.evaluatorRawRequest,
-        evaluatorResults: member.result.evaluatorResults
-      });
-    }
-    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
-    return {
-      score: clampScore(finalScore),
-      verdict: scoreToVerdict(finalScore),
-      hits: allHits,
-      misses: allMisses,
-      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
-      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
-      evaluatorRawRequest: {
-        aggregator: "weighted_average",
-        ...weights ? { weights } : {}
-      },
-      evaluatorResults
-    };
-  }
-  async runCodeAggregator(results, scriptPath, cwd, weights) {
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      weight: weights?.[member.id] ?? 1,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    try {
-      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
-      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
-      return {
-        score,
-        verdict,
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath
-        },
-        evaluatorResults
-      };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [`Code aggregator failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath,
-          error: message
-        },
-        evaluatorResults
-      };
-    }
-  }
-  async runLlmAggregator(results, context, config) {
-    const judgeProvider = context.judgeProvider;
-    if (!judgeProvider) {
-      throw new Error("No judge provider available for LLM aggregation");
-    }
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const resultsJson = JSON.stringify(resultsObject, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
-    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
-    const systemPrompt = buildOutputSchema();
-    const evaluatorRawRequest = {
-      aggregator: "llm_judge",
-      userPrompt,
-      systemPrompt,
-      target: judgeProvider.targetName
-    };
-    try {
-      const model = judgeProvider.asLanguageModel?.();
-      if (model) {
-        const { text } = await (0, import_ai2.generateText)({
-          model,
-          system: systemPrompt,
-          prompt: userPrompt
-        });
-        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
-        const score2 = clampScore(data2.score);
-        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
-        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
-        const reasoning2 = data2.reasoning;
-        return {
-          score: score2,
-          verdict: scoreToVerdict(score2),
-          hits: hits2,
-          misses: misses2,
-          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
-          reasoning: reasoning2,
-          evaluatorRawRequest,
-          evaluatorResults
-        };
-      }
-      const response = await judgeProvider.invoke({
-        question: userPrompt,
-        systemPrompt,
-        evalCaseId: context.evalCase.id,
-        attempt: context.attempt
-      });
-      const data = freeformEvaluationSchema.parse(
-        parseJsonFromText(extractLastAssistantContent(response.outputMessages))
-      );
-      const score = clampScore(data.score);
-      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
-      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning;
+    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
-        score,
-        verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: Math.max(hits.length + misses.length, 1),
-        reasoning,
-        evaluatorRawRequest,
-        evaluatorResults
+        path: path17,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path17} (invalid numeric value)`
       };
-    } catch {
+    }
+    const diff = Math.abs(candidateNum - expectedNum);
+    let withinTolerance;
+    if (relative) {
+      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
+      withinTolerance = relativeDiff <= tolerance;
+    } else {
+      withinTolerance = diff <= tolerance;
+    }
+    if (withinTolerance) {
+      return {
+        path: path17,
+        score: 1,
+        weight,
+        hit: true,
+        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
+      };
+    }
+    return {
+      path: path17,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
+    };
+  }
+  /**
+   * Date comparison with format normalization.
+   */
+  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
+    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
+    const candidateDate = parseDate(String(candidateValue), formats);
+    const expectedDate = parseDate(String(expectedValue), formats);
+    if (candidateDate === null) {
       return {
+        path: path17,
         score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [],
-        expectedAspectCount: 1,
-        evaluatorRawRequest,
-        evaluatorResults
+        weight,
+        hit: false,
+        message: `${path17} (unparseable candidate date)`
+      };
+    }
+    if (expectedDate === null) {
+      return {
+        path: path17,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path17} (unparseable expected date)`
+      };
+    }
+    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
+      return {
+        path: path17,
+        score: 1,
+        weight,
+        hit: true,
+        message: path17
       };
     }
+    return {
+      path: path17,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+    };
+  }
+  /**
+   * Aggregate field results using configured strategy.
+   */
+  aggregateResults(results) {
+    const aggregation = this.config.aggregation ?? "weighted_average";
+    const hits = [];
+    const misses = [];
+    for (const result of results) {
+      if (result.hit) {
+        hits.push(result.message);
+      } else {
+        misses.push(result.message);
+      }
+    }
+    let score;
+    if (aggregation === "all_or_nothing") {
+      score = misses.length === 0 ? 1 : 0;
+    } else {
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
+      if (totalWeight === 0) {
+        score = results.length === 0 ? 1 : 0;
+      } else {
+        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
+        score = weightedSum / totalWeight;
+      }
+    }
+    const reasoning = `${hits.length}/${results.length} fields matched`;
+    return {
+      score: clampScore(score),
+      verdict: scoreToVerdict(score),
+      hits: hits.slice(0, 4),
+      misses: misses.slice(0, 4),
+      expectedAspectCount: results.length,
+      reasoning
+    };
   }
 };
+function resolvePath(obj, path17) {
+  if (!path17 || !obj) {
+    return void 0;
+  }
+  const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  let current = obj;
+  for (const part of parts) {
+    if (current === null || current === void 0) {
+      return void 0;
+    }
+    if (typeof current !== "object") {
+      return void 0;
+    }
+    const isIndex = /^\d+$/.test(part);
+    if (isIndex && Array.isArray(current)) {
+      current = current[Number.parseInt(part, 10)];
+    } else {
+      current = current[part];
+    }
+  }
+  return current;
+}
+function toNumber(value) {
+  if (typeof value === "number") {
+    return value;
+  }
+  if (typeof value === "string") {
+    const num = Number.parseFloat(value);
+    return Number.isNaN(num) ? null : num;
+  }
+  return null;
+}
+function parseDate(dateStr, formats) {
+  if (!dateStr) return null;
+  const trimmed = dateStr.trim();
+  const isoDate = new Date(trimmed);
+  if (!Number.isNaN(isoDate.getTime())) {
+    return isoDate;
+  }
+  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
+  if (localizedMatch) {
+    const day = Number.parseInt(localizedMatch[1], 10);
+    const monthName = localizedMatch[2].toLowerCase();
+    const year = Number.parseInt(localizedMatch[3], 10);
+    const month = MONTH_NAMES[monthName];
+    if (month !== void 0) {
+      return new Date(year, month, day);
+    }
+  }
+  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
+  if (usMatch) {
+    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
+    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
+    if (hasUSFormat && !hasEUFormat) {
+      const month = Number.parseInt(usMatch[1], 10) - 1;
+      const day = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else if (hasEUFormat && !hasUSFormat) {
+      const day = Number.parseInt(usMatch[1], 10);
+      const month = Number.parseInt(usMatch[2], 10) - 1;
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else {
+      const num1 = Number.parseInt(usMatch[1], 10);
+      const num2 = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (num1 > 12 && num2 <= 12) {
+        return new Date(year, num2 - 1, num1);
+      }
+      if (num2 > 12 && num1 <= 12) {
+        return new Date(year, num1 - 1, num2);
+      }
+      if (num1 <= 12 && num2 <= 31) {
+        return new Date(year, num1 - 1, num2);
+      }
+    }
+  }
+  return null;
+}
+function formatDateISO(date) {
+  return date.toISOString().split("T")[0];
+}
+function parseJsonFromTextSafe(text) {
+  return parseJsonFromText(text);
+}
+// src/evaluation/evaluators/latency.ts
 var LatencyEvaluator = class {
   kind = "latency";
   config;
@@ -7816,56 +7931,16 @@ var LatencyEvaluator = class {
       misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
       expectedAspectCount: 1,
       reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
-      evaluatorRawRequest: {
-        type: "latency",
-        threshold,
-        durationMs
-      }
-    };
-  }
-};
-var CostEvaluator = class {
-  kind = "cost";
-  config;
-  constructor(options) {
-    this.config = options.config;
-  }
-  evaluate(context) {
-    const { budget } = this.config;
-    const costUsd = context.traceSummary?.costUsd;
-    if (costUsd === void 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No cost data available in trace"],
-        expectedAspectCount: 1,
-        reasoning: "Execution cost not reported by provider",
-        evaluatorRawRequest: {
-          type: "cost",
-          budget,
-          costUsd: null
-        }
-      };
-    }
-    const passed = costUsd <= budget;
-    const score = passed ? 1 : 0;
-    const formatCost = (n) => `$${n.toFixed(4)}`;
-    return {
-      score,
-      verdict: passed ? "pass" : "fail",
-      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
-      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
-      expectedAspectCount: 1,
-      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
-      evaluatorRawRequest: {
-        type: "cost",
-        budget,
-        costUsd
+      evaluatorRawRequest: {
+        type: "latency",
+        threshold,
+        durationMs
       }
     };
   }
 };
+// src/evaluation/evaluators/token-usage.ts
 var TokenUsageEvaluator = class {
   kind = "token_usage";
   config;
@@ -7949,8 +8024,228 @@ var TokenUsageEvaluator = class {
   }
 };
+// src/evaluation/evaluators/tool-trajectory.ts
+function argsMatch(expected, actual) {
+  if (expected === void 0) return true;
+  if (expected === "any") return true;
+  if (actual === void 0) return false;
+  for (const key of Object.keys(expected)) {
+    if (!Object.hasOwn(actual, key)) return false;
+    if (!deepEqual(expected[key], actual[key])) return false;
+  }
+  return true;
+}
+var ToolTrajectoryEvaluator = class {
+  kind = "tool_trajectory";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  evaluate(context) {
+    const { outputMessages, traceSummary } = context;
+    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
+    if (toolCalls.length === 0 && !traceSummary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
+    if (!summary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    switch (this.config.mode) {
+      case "any_order":
+        return this.evaluateAnyOrder(summary);
+      case "in_order":
+        return this.evaluateInOrder(toolCalls);
+      case "exact":
+        return this.evaluateExact(toolCalls);
+      default:
+        return {
+          score: 0,
+          verdict: "fail",
+          hits: [],
+          misses: [`Unknown mode: ${this.config.mode}`],
+          expectedAspectCount: 1
+        };
+    }
+  }
+  /**
+   * Extract tool calls from output messages.
+   */
+  extractToolCallsFromMessages(messages) {
+    if (!messages) {
+      return [];
+    }
+    const toolCalls = [];
+    for (const message of messages) {
+      if (message.toolCalls) {
+        for (const call of message.toolCalls) {
+          toolCalls.push({
+            name: call.tool,
+            args: call.input
+          });
+        }
+      }
+    }
+    return toolCalls;
+  }
+  /**
+   * Build a summary from extracted tool calls.
+   */
+  buildSummary(toolCalls) {
+    const toolCallsByName = {};
+    for (const call of toolCalls) {
+      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
+    }
+    const toolNames = Object.keys(toolCallsByName).sort();
+    return {
+      eventCount: toolCalls.length,
+      toolNames,
+      toolCallsByName,
+      errorCount: 0
+    };
+  }
+  evaluateAnyOrder(summary) {
+    const minimums = this.config.minimums ?? {};
+    const toolNames = Object.keys(minimums);
+    if (toolNames.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool requirements specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    for (const toolName of toolNames) {
+      const required = minimums[toolName];
+      const actual = summary.toolCallsByName[toolName] ?? 0;
+      if (actual >= required) {
+        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      } else {
+        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      }
+    }
+    const score = hits.length / toolNames.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: toolNames.length
+    };
+  }
+  evaluateInOrder(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    let actualIndex = 0;
+    for (let i = 0; i < expected.length; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      let found = false;
+      let argsMismatch = false;
+      while (actualIndex < toolCalls.length) {
+        const actualCall = toolCalls[actualIndex];
+        if (actualCall.name === expectedTool) {
+          if (argsMatch(expectedItem.args, actualCall.args)) {
+            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            actualIndex++;
+            found = true;
+            break;
+          }
+          misses.push(
+            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
+          );
+          actualIndex++;
+          argsMismatch = true;
+          break;
+        }
+        actualIndex++;
+      }
+      if (!found && !argsMismatch) {
+        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
+      }
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+  evaluateExact(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    if (toolCalls.length !== expected.length) {
+      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
+    }
+    const checkLength = Math.min(expected.length, toolCalls.length);
+    for (let i = 0; i < checkLength; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      const actualCall = toolCalls[i];
+      const actualTool = actualCall.name;
+      if (actualTool === expectedTool) {
+        if (argsMatch(expectedItem.args, actualCall.args)) {
+          hits.push(`Position ${i}: ${expectedTool}`);
+        } else {
+          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+        }
+      } else {
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
 // src/evaluation/orchestrator.ts
-var import_node_crypto4 = require("crypto");
+var import_node_crypto5 = require("crypto");
 var import_node_path16 = __toESM(require("path"), 1);
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -8162,6 +8457,17 @@ async function runEvaluation(options) {
     }
     return getOrCreateProvider(resolvedJudge);
   };
+  const targetResolver = (name) => {
+    const resolved = resolveTargetByName(name);
+    if (!resolved) {
+      return void 0;
+    }
+    return getOrCreateProvider(resolved);
+  };
+  const availableTargets = [
+    target.name,
+    ...Array.from(targetDefinitions.keys())
+  ];
   const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
   const primaryProvider = getOrCreateProvider(target);
   const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -8191,7 +8497,9 @@ async function runEvaluation(options) {
         onResult,
         verbose,
         resolveJudgeProvider,
-        agentTimeoutMs
+        agentTimeoutMs,
+        targetResolver,
+        availableTargets
       });
     } catch (error) {
       if (verbose) {
@@ -8230,7 +8538,9 @@ async function runEvaluation(options) {
           cache,
           useCache,
           now,
-          judgeProvider
+          judgeProvider,
+          targetResolver,
+          availableTargets
         });
         if (onProgress) {
           await onProgress({
@@ -8297,7 +8607,9 @@ async function runBatchEvaluation(options) {
     onProgress,
     onResult,
     resolveJudgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    targetResolver,
+    availableTargets
   } = options;
   const promptInputsList = [];
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -8356,7 +8668,7 @@ async function runBatchEvaluation(options) {
       costUsd: providerResponse.costUsd,
       durationMs: providerResponse.durationMs
     }) : void 0;
-    const candidate = extractLastAssistantContent(outputMessages);
+    const candidate = extractLastAssistantContent2(outputMessages);
     const providerError = extractProviderError(providerResponse);
     let result;
     try {
@@ -8372,7 +8684,9 @@ async function runBatchEvaluation(options) {
         judgeProvider: await resolveJudgeProvider(target),
         agentTimeoutMs,
         outputMessages,
-        traceSummary
+        traceSummary,
+        targetResolver,
+        availableTargets
       });
       if (providerError) {
         result = { ...result, error: providerError };
@@ -8430,7 +8744,9 @@ async function runEvalCase(options) {
     cache,
     useCache,
     signal,
-    judgeProvider
+    judgeProvider,
+    targetResolver,
+    availableTargets
   } = options;
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -8489,7 +8805,7 @@ async function runEvalCase(options) {
     costUsd: providerResponse.costUsd,
     durationMs: providerResponse.durationMs
   }) : void 0;
-  const candidate = extractLastAssistantContent(outputMessages);
+  const candidate = extractLastAssistantContent2(outputMessages);
   const providerError = extractProviderError(providerResponse);
   try {
     const result = await evaluateCandidate({
@@ -8504,7 +8820,9 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
     return providerError ? { ...result, error: providerError } : result;
   } catch (error) {
@@ -8524,7 +8842,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const gradeTimestamp = nowFn();
   const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -8539,7 +8859,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   const completedAt = nowFn();
   let agentProviderRequest;
@@ -8592,7 +8914,9 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
     return runEvaluatorList({
@@ -8608,7 +8932,9 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -8626,7 +8952,9 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   return { score };
 }
@@ -8644,7 +8972,9 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const scored = [];
   const evaluatorResults = [];
@@ -8682,7 +9012,8 @@ async function runEvaluatorList(options) {
           script: evaluator.script,
           cwd: evaluator.resolvedCwd ?? evaluator.cwd,
           agentTimeoutMs,
-          config: evaluator.config
+          config: evaluator.config,
+          target: evaluator.target
         });
         const score2 = await codeEvaluator.evaluate({
           evalCase,
@@ -8692,8 +9023,11 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
+          judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -8706,7 +9040,8 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluatorProviderRequest: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest,
+          details: score2.details
         });
       }
       if (evaluator.type === "composite") {
@@ -8720,7 +9055,8 @@ async function runEvaluatorList(options) {
                 script: memberConfig.script,
                 cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
                 agentTimeoutMs,
-                config: memberConfig.config
+                config: memberConfig.config,
+                target: memberConfig.target
               });
             case "composite":
               return new CompositeEvaluator({
@@ -8769,7 +9105,9 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -8965,11 +9303,11 @@ async function runEvaluatorList(options) {
     (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
     0
   );
-  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
+  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
     score: aggregateScore,
-    verdict: scoreToVerdict2(aggregateScore),
+    verdict: scoreToVerdict(aggregateScore),
     hits,
     misses,
     expectedAspectCount,
@@ -9016,18 +9354,6 @@ async function resolveCustomPrompt(config) {
   }
   return config.prompt;
 }
-function isNonEmptyString2(value) {
-  return typeof value === "string" && value.trim().length > 0;
-}
-function scoreToVerdict2(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
-}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
@@ -9129,7 +9455,7 @@ function extractProviderError(response) {
   return trimmed.length > 0 ? trimmed : void 0;
 }
 function createCacheKey(provider, target, evalCase, promptInputs) {
-  const hash = (0, import_node_crypto4.createHash)("sha256");
+  const hash = (0, import_node_crypto5.createHash)("sha256");
   hash.update(provider.id);
   hash.update(target.name);
   hash.update(evalCase.id);
@@ -9170,7 +9496,8 @@ function mapChildResults(children) {
     misses: child.misses,
     reasoning: child.reasoning,
     evaluatorProviderRequest: child.evaluatorRawRequest,
-    evaluatorResults: mapChildResults(child.evaluatorResults)
+    evaluatorResults: mapChildResults(child.evaluatorResults),
+    details: child.details
   }));
 }
 function computeWeightedMean(entries) {
@@ -9185,7 +9512,7 @@ function computeWeightedMean(entries) {
 }
 // src/evaluation/generators/rubric-generator.ts
-var import_ai3 = require("ai");
+var import_ai4 = require("ai");
 var import_zod4 = require("zod");
 var rubricItemSchema = import_zod4.z.object({
   id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -9219,7 +9546,7 @@ You must return a valid JSON object matching this schema:
   let lastError;
   for (let attempt = 1; attempt <= 3; attempt++) {
     try {
-      const { text } = await (0, import_ai3.generateText)({
+      const { text } = await (0, import_ai4.generateText)({
         model,
         system,
         prompt
@@ -9282,31 +9609,39 @@ function createAgentKernel() {
   ToolTrajectoryEvaluator,
   avgToolDurationMs,
   buildDirectoryChain,
+  buildOutputSchema,
   buildPromptInputs,
   buildSearchRoots,
+  clampScore,
   computeTraceSummary,
   consumeClaudeCodeLogEntries,
   consumeCodexLogEntries,
   consumePiLogEntries,
   createAgentKernel,
   createProvider,
+  deepEqual,
   ensureVSCodeSubagents,
+  executeScript,
   explorationRatio,
-  extractCodeBlocks,
+  extractJsonBlob,
   fileExists,
   findGitRoot,
+  freeformEvaluationSchema,
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,
+  isNonEmptyString,
   isTestMessage,
   isTestMessageRole,
   listTargetNames,
   loadEvalCases,
   mergeExecutionMetrics,
   normalizeLineEndings,
+  parseJsonFromText,
+  parseJsonSafe,
   readJsonFile,
   readTargetDefinitions,
   readTestSuiteMetadata,
@@ -9316,6 +9651,7 @@ function createAgentKernel() {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
+  scoreToVerdict,
   subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries,
   subscribeToPiLogEntries,