npm - @agentv/core - Versions diffs - 2.0.2 → 2.1.1 - Mend

@agentv/core 2.0.2 → 2.1.1

This diff shows the differences between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (11) hide show

package/dist/evaluation/validation/index.cjs +0 -11
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +0 -11
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1336 -1007
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +142 -71
package/dist/index.d.ts +142 -71
package/dist/index.js +1293 -973
package/dist/index.js.map +1 -1
package/package.json +2 -2

package/dist/index.js CHANGED Viewed

@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
 import path6 from "node:path";
 import { parse as parse2 } from "yaml";
-// src/evaluation/formatting/segment-formatter.ts
-function extractCodeBlocks(segments) {
-  const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
-  const codeBlocks = [];
-  for (const segment of segments) {
-    const typeValue = segment.type;
-    if (typeof typeValue !== "string" || typeValue !== "text") {
-      continue;
-    }
-    const textValue = segment.value;
-    if (typeof textValue !== "string") {
-      continue;
-    }
-    const matches = textValue.match(CODE_BLOCK_PATTERN);
-    if (matches) {
-      codeBlocks.push(...matches);
-    }
-  }
-  return codeBlocks;
-}
-function formatFileContents(parts) {
-  const fileCount = parts.filter((p) => p.isFile).length;
-  if (fileCount > 0) {
-    return parts.map((part) => {
-      if (part.isFile && part.displayPath) {
-        return `<file path="${part.displayPath}">
-${part.content}
-</file>`;
-      }
-      return part.content;
-    }).join("\n\n");
-  }
-  return parts.map((p) => p.content).join(" ");
-}
-function formatSegment(segment, mode = "lm") {
-  const type = asString(segment.type);
-  if (type === "text") {
-    return asString(segment.value);
-  }
-  if (type === "guideline_ref") {
-    const refPath = asString(segment.path);
-    return refPath ? `<Attached: ${refPath}>` : void 0;
-  }
-  if (type === "file") {
-    const filePath = asString(segment.path);
-    if (!filePath) {
-      return void 0;
-    }
-    if (mode === "agent") {
-      return `<file: path="${filePath}">`;
-    }
-    const text = asString(segment.text);
-    if (text && filePath) {
-      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
-    }
-  }
-  return void 0;
-}
-function hasVisibleContent(segments) {
-  return segments.some((segment) => {
-    const type = asString(segment.type);
-    if (type === "text") {
-      const value = asString(segment.value);
-      return value !== void 0 && value.trim().length > 0;
-    }
-    if (type === "guideline_ref") {
-      return false;
-    }
-    if (type === "file") {
-      const text = asString(segment.text);
-      return text !== void 0 && text.trim().length > 0;
-    }
-    return false;
-  });
-}
-function asString(value) {
-  return typeof value === "string" ? value : void 0;
-}
 // src/evaluation/loaders/config-loader.ts
 import { readFile } from "node:fs/promises";
 import path2 from "node:path";
@@ -336,7 +257,6 @@ async function resolveFileReference2(rawValue, searchRoots) {
 }
 // src/evaluation/loaders/config-loader.ts
-var SCHEMA_CONFIG_V2 = "agentv-config-v2";
 var ANSI_YELLOW = "\x1B[33m";
 var ANSI_RESET = "\x1B[0m";
 async function loadConfig(evalFilePath, repoRoot) {
@@ -354,13 +274,6 @@ async function loadConfig(evalFilePath, repoRoot) {
         continue;
       }
       const config = parsed;
-      const schema = config.$schema;
-      if (schema !== SCHEMA_CONFIG_V2) {
-        const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
-Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
-        logWarning(message);
-        continue;
-      }
       const guidelinePatterns = config.guideline_patterns;
       if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
         logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -469,7 +382,8 @@ var ANSI_YELLOW3 = "\x1B[33m";
 var ANSI_RESET3 = "\x1B[0m";
 async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
   const execution = rawEvalCase.execution;
-  const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
+  const executionObject = isJsonObject2(execution) ? execution : void 0;
+  const candidateEvaluators = (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators ?? globalExecution?.evaluators;
   if (candidateEvaluators === void 0) {
     return void 0;
   }
@@ -483,7 +397,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
       continue;
     }
-    const name = asString2(rawEvaluator.name);
+    const name = asString(rawEvaluator.name);
     const typeValue = rawEvaluator.type;
     if (!name || !isEvaluatorKind(typeValue)) {
       logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -511,7 +425,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const cwd = asString2(rawEvaluator.cwd);
+      const cwd = asString(rawEvaluator.cwd);
       let resolvedCwd;
       if (cwd) {
         const resolved = await resolveFileReference2(cwd, searchRoots);
@@ -526,7 +440,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       } else {
         resolvedCwd = searchRoots[0];
       }
-      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
+      const rawTarget = rawEvaluator.target;
+      let targetConfig;
+      if (rawTarget !== void 0) {
+        if (isJsonObject2(rawTarget)) {
+          const maxCalls = rawTarget.max_calls;
+          if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
+            logWarning2(
+              `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
+            );
+          } else {
+            targetConfig = {
+              ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
+            };
+          }
+        } else if (rawTarget === true) {
+          targetConfig = {};
+        } else {
+          logWarning2(
+            `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
+          );
+        }
+      }
+      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
       const config = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
         if (!knownProps.has(key) && value !== void 0) {
@@ -540,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         cwd,
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
-        ...Object.keys(config).length > 0 ? { config } : {}
+        ...Object.keys(config).length > 0 ? { config } : {},
+        ...targetConfig !== void 0 ? { target: targetConfig } : {}
       });
       continue;
     }
@@ -557,7 +494,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
         continue;
       }
-      const aggregatorType = asString2(rawAggregator.type);
+      const aggregatorType = asString(rawAggregator.type);
       if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
         logWarning2(
           `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -570,7 +507,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
           continue;
         }
-        const memberName = asString2(rawMember.name);
+        const memberName = asString(rawMember.name);
         const memberType = rawMember.type;
         if (!memberName || !isEvaluatorKind(memberType)) {
           logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -608,7 +545,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
         };
       } else if (aggregatorType === "code_judge") {
-        const aggregatorPath = asString2(rawAggregator.path);
+        const aggregatorPath = asString(rawAggregator.path);
         if (!aggregatorPath) {
           logWarning2(
             `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -621,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           cwd: searchRoots[0]
         };
       } else {
-        const aggregatorPrompt = asString2(rawAggregator.prompt);
+        const aggregatorPrompt = asString(rawAggregator.prompt);
         let promptPath2;
         if (aggregatorPrompt) {
           const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
@@ -646,7 +583,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     if (typeValue === "tool_trajectory") {
-      const mode = asString2(rawEvaluator.mode);
+      const mode = asString(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
         logWarning2(
           `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -737,8 +674,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           );
           continue;
         }
-        const fieldPath = asString2(rawField.path);
-        const match = asString2(rawField.match);
+        const fieldPath = asString(rawField.path);
+        const match = asString(rawField.match);
         if (!fieldPath) {
           logWarning2(
             `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -768,7 +705,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
         continue;
       }
-      const aggregation = asString2(rawEvaluator.aggregation);
+      const aggregation = asString(rawEvaluator.aggregation);
       const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
@@ -849,7 +786,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    const prompt = asString2(rawEvaluator.prompt);
+    const prompt = asString(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
       const resolved = await resolveFileReference2(prompt, searchRoots);
@@ -868,11 +805,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
       }
     }
-    const _model = asString2(rawEvaluator.model);
+    const _model = asString(rawEvaluator.model);
     const rawRubrics = rawEvaluator.rubrics;
     const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-      id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-      description: asString2(rubric.description) ?? "",
+      id: asString(rubric.id) ?? `rubric-${index + 1}`,
+      description: asString(rubric.description) ?? "",
       weight: typeof rubric.weight === "number" ? rubric.weight : 1,
       required: typeof rubric.required === "boolean" ? rubric.required : true
     })).filter((r) => r.description.length > 0) : void 0;
@@ -916,7 +853,7 @@ function coerceEvaluator(candidate, contextId) {
   logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
   return void 0;
 }
-function asString2(value) {
+function asString(value) {
   return typeof value === "string" ? value : void 0;
 }
 function asStringArray(value, description) {
@@ -992,6 +929,68 @@ function isValidFieldAggregationType(value) {
 // src/evaluation/loaders/message-processor.ts
 import { readFile as readFile3 } from "node:fs/promises";
 import path4 from "node:path";
+// src/evaluation/formatting/segment-formatter.ts
+function formatFileContents(parts) {
+  const fileCount = parts.filter((p) => p.isFile).length;
+  if (fileCount > 0) {
+    return parts.map((part) => {
+      if (part.isFile && part.displayPath) {
+        return `<file path="${part.displayPath}">
+${part.content}
+</file>`;
+      }
+      return part.content;
+    }).join("\n\n");
+  }
+  return parts.map((p) => p.content).join(" ");
+}
+function formatSegment(segment, mode = "lm") {
+  const type = asString2(segment.type);
+  if (type === "text") {
+    return asString2(segment.value);
+  }
+  if (type === "guideline_ref") {
+    const refPath = asString2(segment.path);
+    return refPath ? `<Attached: ${refPath}>` : void 0;
+  }
+  if (type === "file") {
+    const filePath = asString2(segment.path);
+    if (!filePath) {
+      return void 0;
+    }
+    if (mode === "agent") {
+      return `<file: path="${filePath}">`;
+    }
+    const text = asString2(segment.text);
+    if (text && filePath) {
+      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
+    }
+  }
+  return void 0;
+}
+function hasVisibleContent(segments) {
+  return segments.some((segment) => {
+    const type = asString2(segment.type);
+    if (type === "text") {
+      const value = asString2(segment.value);
+      return value !== void 0 && value.trim().length > 0;
+    }
+    if (type === "guideline_ref") {
+      return false;
+    }
+    if (type === "file") {
+      const text = asString2(segment.text);
+      return text !== void 0 && text.trim().length > 0;
+    }
+    return false;
+  });
+}
+function asString2(value) {
+  return typeof value === "string" ? value : void 0;
+}
+// src/evaluation/loaders/message-processor.ts
 var ANSI_YELLOW4 = "\x1B[33m";
 var ANSI_RESET4 = "\x1B[0m";
 async function processMessages(options) {
@@ -1297,9 +1296,6 @@ ${messageContent}`);
         questionParts.push(formattedContent);
       }
     }
-    if (testCase.code_snippets.length > 0) {
-      questionParts.push(testCase.code_snippets.join("\n"));
-    }
     question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
   }
   const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1498,7 +1494,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       repoRootPath,
       verbose
     }) : [];
-    const codeSnippets = extractCodeBlocks(inputSegments);
     let referenceAnswer = "";
     if (outputSegments.length > 0) {
       const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1571,7 +1566,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
       file_paths: allFilePaths,
-      code_snippets: codeSnippets,
       expected_outcome: outcome,
       evaluator: evalCaseEvaluatorKind,
       evaluators
@@ -5311,9 +5305,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
-// src/evaluation/evaluators.ts
-import { generateText as generateText2 } from "ai";
-import { z as z2 } from "zod";
+// src/evaluation/evaluators/scoring.ts
+function scoreToVerdict(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
+function clampScore(value) {
+  if (Number.isNaN(value) || !Number.isFinite(value)) {
+    return 0;
+  }
+  if (value < 0) {
+    return 0;
+  }
+  if (value > 1) {
+    return 1;
+  }
+  return value;
+}
+function extractJsonBlob(text) {
+  const match = text.match(/\{[\s\S]*\}/);
+  return match?.[0];
+}
+function parseJsonFromText(text) {
+  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+  const blob = extractJsonBlob(cleaned) ?? cleaned;
+  return JSON.parse(blob);
+}
+function isNonEmptyString(value) {
+  return typeof value === "string" && value.trim().length > 0;
+}
+function parseJsonSafe(payload) {
+  try {
+    return JSON.parse(payload);
+  } catch {
+    return void 0;
+  }
+}
+function deepEqual(a, b) {
+  if (a === b) return true;
+  if (a === null || b === null) return a === b;
+  if (typeof a !== typeof b) return false;
+  if (typeof a !== "object") return a === b;
+  if (Array.isArray(a) !== Array.isArray(b)) return false;
+  if (Array.isArray(a) && Array.isArray(b)) {
+    if (a.length !== b.length) return false;
+    return a.every((val, i) => deepEqual(val, b[i]));
+  }
+  const aObj = a;
+  const bObj = b;
+  const aKeys = Object.keys(aObj);
+  const bKeys = Object.keys(bObj);
+  if (aKeys.length !== bKeys.length) return false;
+  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
+}
 // src/runtime/exec.ts
 function shellEscapePath(value) {
@@ -5338,7 +5387,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
     cwd: options.cwd,
     stdin: encoder.encode(stdinPayload),
     stdout: "pipe",
-    stderr: "pipe"
+    stderr: "pipe",
+    // Merge additional env vars with process.env
+    env: options.env ? { ...process.env, ...options.env } : process.env
   });
   let timedOut = false;
   const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -5373,7 +5424,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
     const [cmd, ...args] = argv;
     const child = spawn4(cmd, args, {
       cwd: options.cwd,
-      stdio: ["pipe", "pipe", "pipe"]
+      stdio: ["pipe", "pipe", "pipe"],
+      // Merge additional env vars with process.env
+      env: options.env ? { ...process.env, ...options.env } : process.env
     });
     const stdoutChunks = [];
     const stderrChunks = [];
@@ -5426,7 +5479,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
       const child = spawn4(wrappedCommand, {
         shell: true,
         cwd: options.cwd,
-        stdio: ["ignore", "ignore", "ignore"]
+        stdio: ["ignore", "ignore", "ignore"],
+        // Merge additional env vars with process.env
+        env: options.env ? { ...process.env, ...options.env } : process.env
       });
       const timeout = options.timeoutMs ? setTimeout(() => {
         child.kill();
@@ -5453,32 +5508,387 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
   }
 }
-// src/evaluation/case-conversion.ts
-function toSnakeCase(str) {
-  if (/^[A-Z]/.test(str)) {
-    return str;
+// src/runtime/target-proxy.ts
+import { randomBytes } from "node:crypto";
+import { createServer } from "node:http";
+var DEFAULT_MAX_CALLS = 50;
+async function createTargetProxy(options) {
+  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
+  const token = randomBytes(32).toString("hex");
+  let callCount = 0;
+  let isShutdown = false;
+  const targetsList = availableTargets ?? [defaultProvider.targetName];
+  function resolveProvider(targetName) {
+    if (targetName === void 0 || targetName === defaultProvider.targetName) {
+      return defaultProvider;
+    }
+    if (targetResolver) {
+      return targetResolver(targetName);
+    }
+    return void 0;
   }
-  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
-}
-function toSnakeCaseDeep(obj) {
-  if (obj === null || obj === void 0) {
-    return obj;
+  const server = createServer(async (req, res) => {
+    res.setHeader("Access-Control-Allow-Origin", "*");
+    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
+    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
+    if (req.method === "OPTIONS") {
+      res.writeHead(204);
+      res.end();
+      return;
+    }
+    const authHeader = req.headers.authorization;
+    if (!authHeader || authHeader !== `Bearer ${token}`) {
+      sendJson(res, 401, { error: "Unauthorized" });
+      return;
+    }
+    if (isShutdown) {
+      sendJson(res, 503, { error: "Proxy is shutting down" });
+      return;
+    }
+    const url2 = req.url ?? "";
+    if (req.method === "GET" && url2 === "/info") {
+      handleInfo(res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invoke") {
+      await handleInvoke(req, res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invokeBatch") {
+      await handleInvokeBatch(req, res);
+      return;
+    }
+    sendJson(res, 404, { error: "Not found" });
+  });
+  function handleInfo(res) {
+    const response = {
+      targetName: defaultProvider.targetName,
+      maxCalls,
+      callCount,
+      availableTargets: targetsList
+    };
+    sendJson(res, 200, response);
   }
-  if (Array.isArray(obj)) {
-    return obj.map((item) => toSnakeCaseDeep(item));
+  async function handleInvoke(req, res) {
+    if (callCount >= maxCalls) {
+      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
+      return;
+    }
+    try {
+      const body = await readBody(req);
+      const request = JSON.parse(body);
+      if (!request.question || typeof request.question !== "string") {
+        sendJson(res, 400, { error: "Missing required field: question" });
+        return;
+      }
+      const provider = resolveProvider(request.target);
+      if (!provider) {
+        sendJson(res, 400, {
+          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+        });
+        return;
+      }
+      callCount++;
+      const response = await provider.invoke({
+        question: request.question,
+        systemPrompt: request.systemPrompt,
+        evalCaseId: request.evalCaseId ?? "proxy",
+        attempt: request.attempt ?? 1
+      });
+      const outputMessages = response.outputMessages ?? [];
+      const rawText = extractLastAssistantContent2(outputMessages);
+      const result = {
+        outputMessages,
+        rawText
+      };
+      sendJson(res, 200, result);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
+    }
   }
-  if (typeof obj === "object") {
-    const result = {};
-    for (const [key, value] of Object.entries(obj)) {
-      const snakeKey = toSnakeCase(key);
-      result[snakeKey] = toSnakeCaseDeep(value);
+  async function handleInvokeBatch(req, res) {
+    try {
+      const body = await readBody(req);
+      const { requests } = JSON.parse(body);
+      if (!Array.isArray(requests)) {
+        sendJson(res, 400, { error: "Missing required field: requests (array)" });
+        return;
+      }
+      if (callCount + requests.length > maxCalls) {
+        sendJson(res, 429, {
+          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
+        });
+        return;
+      }
+      const responses = [];
+      for (const request of requests) {
+        if (!request.question || typeof request.question !== "string") {
+          responses.push({
+            outputMessages: [],
+            rawText: "Error: Missing required field: question"
+          });
+          continue;
+        }
+        const provider = resolveProvider(request.target);
+        if (!provider) {
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+          });
+          continue;
+        }
+        callCount++;
+        try {
+          const response = await provider.invoke({
+            question: request.question,
+            systemPrompt: request.systemPrompt,
+            evalCaseId: request.evalCaseId ?? "proxy",
+            attempt: request.attempt ?? 1
+          });
+          const outputMessages = response.outputMessages ?? [];
+          responses.push({
+            outputMessages,
+            rawText: extractLastAssistantContent2(outputMessages)
+          });
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: ${message}`
+          });
+        }
+      }
+      sendJson(res, 200, { responses });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
     }
-    return result;
   }
-  return obj;
+  await new Promise((resolve, reject) => {
+    server.once("error", reject);
+    server.listen(0, "127.0.0.1", () => {
+      server.removeListener("error", reject);
+      resolve();
+    });
+  });
+  const address = server.address();
+  const url = `http://127.0.0.1:${address.port}`;
+  return {
+    url,
+    token,
+    shutdown: async () => {
+      isShutdown = true;
+      return new Promise((resolve, reject) => {
+        server.close((err) => {
+          if (err) reject(err);
+          else resolve();
+        });
+      });
+    },
+    getUsageMetadata: () => ({
+      callCount,
+      maxCalls
+    })
+  };
+}
+function sendJson(res, statusCode, body) {
+  res.writeHead(statusCode, { "Content-Type": "application/json" });
+  res.end(JSON.stringify(body));
+}
+function readBody(req) {
+  return new Promise((resolve, reject) => {
+    const chunks = [];
+    req.on("data", (chunk) => chunks.push(chunk));
+    req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
+    req.on("error", reject);
+  });
+}
+function extractLastAssistantContent2(messages) {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === "assistant" && msg.content !== void 0) {
+      if (typeof msg.content === "string") {
+        return msg.content;
+      }
+      if (Array.isArray(msg.content)) {
+        for (const part of msg.content) {
+          if (typeof part === "object" && part !== null && "text" in part) {
+            return String(part.text);
+          }
+        }
+      }
+    }
+  }
+  return void 0;
+}
+// src/evaluation/case-conversion.ts
+function toSnakeCase(str) {
+  if (/^[A-Z]/.test(str)) {
+    return str;
+  }
+  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
+}
+function toSnakeCaseDeep(obj) {
+  if (obj === null || obj === void 0) {
+    return obj;
+  }
+  if (Array.isArray(obj)) {
+    return obj.map((item) => toSnakeCaseDeep(item));
+  }
+  if (typeof obj === "object") {
+    const result = {};
+    for (const [key, value] of Object.entries(obj)) {
+      const snakeKey = toSnakeCase(key);
+      result[snakeKey] = toSnakeCaseDeep(value);
+    }
+    return result;
+  }
+  return obj;
+}
+// src/evaluation/evaluators/code-evaluator.ts
+var CodeEvaluator = class {
+  kind = "code";
+  script;
+  cwd;
+  agentTimeoutMs;
+  config;
+  target;
+  constructor(options) {
+    this.script = options.script;
+    this.cwd = options.cwd;
+    this.agentTimeoutMs = options.agentTimeoutMs;
+    this.config = options.config;
+    this.target = options.target;
+  }
+  async evaluate(context) {
+    const payload = {
+      question: context.evalCase.question,
+      expectedOutcome: context.evalCase.expected_outcome,
+      expectedMessages: context.evalCase.expected_messages,
+      referenceAnswer: context.evalCase.reference_answer,
+      candidateAnswer: context.candidate,
+      outputMessages: context.outputMessages ?? null,
+      guidelineFiles: context.evalCase.guideline_paths,
+      inputFiles: context.evalCase.file_paths.filter(
+        (path15) => !context.evalCase.guideline_paths.includes(path15)
+      ),
+      inputMessages: context.evalCase.input_messages,
+      traceSummary: context.traceSummary ?? null,
+      config: this.config ?? null
+    };
+    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+    let proxyEnv;
+    let proxyShutdown;
+    let getProxyUsage;
+    if (this.target !== void 0 && context.judgeProvider) {
+      const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
+      const proxy = await createTargetProxy({
+        defaultProvider: context.judgeProvider,
+        targetResolver: context.targetResolver,
+        availableTargets: context.availableTargets,
+        maxCalls
+      });
+      proxyEnv = {
+        AGENTV_TARGET_PROXY_URL: proxy.url,
+        AGENTV_TARGET_PROXY_TOKEN: proxy.token
+      };
+      proxyShutdown = proxy.shutdown;
+      getProxyUsage = proxy.getUsageMetadata;
+    }
+    try {
+      const stdout = await executeScript(
+        this.script,
+        inputPayload,
+        this.agentTimeoutMs,
+        this.cwd,
+        proxyEnv
+      );
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
+      const proxyUsage = getProxyUsage?.();
+      const evaluatorRawRequest = {
+        script: this.script,
+        ...this.cwd ? { cwd: this.cwd } : {},
+        ...proxyUsage ? {
+          target_proxy: {
+            call_count: proxyUsage.callCount,
+            max_calls: proxyUsage.maxCalls
+          }
+        } : {}
+      };
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest,
+        ...details ? { details } : {}
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      const proxyUsage = getProxyUsage?.();
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code evaluator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          script: this.script,
+          ...this.cwd ? { cwd: this.cwd } : {},
+          ...proxyUsage ? {
+            target_proxy: {
+              call_count: proxyUsage.callCount,
+              max_calls: proxyUsage.maxCalls
+            }
+          } : {},
+          error: message
+        }
+      };
+    } finally {
+      if (proxyShutdown) {
+        await proxyShutdown();
+      }
+    }
+  }
+};
+async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
+  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+  if (exitCode !== 0) {
+    const trimmedErr = formatStderr(stderr);
+    throw new Error(
+      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+    );
+  }
+  return stdout.trim();
+}
+function formatStderr(stderr) {
+  const trimmed = stderr.trim();
+  const maxLength = 2e3;
+  if (trimmed.length <= maxLength) {
+    return trimmed;
+  }
+  const tail = trimmed.slice(-maxLength);
+  return `...(truncated, last ${maxLength} chars)
+${tail}`;
 }
-// src/evaluation/evaluators.ts
+// src/evaluation/evaluators/composite.ts
+import { generateText as generateText3 } from "ai";
+// src/evaluation/evaluators/llm-judge.ts
+import { generateText as generateText2 } from "ai";
+import { z as z2 } from "zod";
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -5558,7 +5968,7 @@ var LlmJudgeEvaluator = class {
       target: judgeProvider.targetName
     };
     try {
-      const { data, providerResponse } = await this.runWithRetry({
+      const { data } = await this.runWithRetry({
         context,
         judgeProvider,
         systemPrompt,
@@ -5707,105 +6117,11 @@ You must return a valid JSON object matching this schema:
   "overall_reasoning": "string (summary)"
 }`;
 }
-function scoreToVerdict(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
-}
-function clampScore(value) {
-  if (Number.isNaN(value) || !Number.isFinite(value)) {
-    return 0;
-  }
-  if (value < 0) {
-    return 0;
-  }
-  if (value > 1) {
-    return 1;
-  }
-  return value;
-}
-function extractJsonBlob(text) {
-  const match = text.match(/\{[\s\S]*\}/);
-  return match?.[0];
-}
-function parseJsonFromText(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const blob = extractJsonBlob(cleaned) ?? cleaned;
-  return JSON.parse(blob);
-}
-function isNonEmptyString(value) {
-  return typeof value === "string" && value.trim().length > 0;
+/**
+ * Replaces `{{ name }}` placeholders in `template` with values from `variables`.
+ *
+ * Placeholder names may contain letters, digits and underscores, with optional
+ * whitespace inside the braces. A placeholder is left untouched when `variables`
+ * has no own property of that name, or when the value is null/undefined.
+ *
+ * Only own properties are consulted, so names such as `toString` or
+ * `constructor` are not resolved through Object.prototype.
+ */
+function substituteVariables(template, variables) {
+  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
+    if (!Object.hasOwn(variables, varName)) {
+      return match;
+    }
+    return variables[varName] ?? match;
+  });
 }
-var CodeEvaluator = class {
-  kind = "code";
-  script;
-  cwd;
-  agentTimeoutMs;
-  config;
-  constructor(options) {
-    this.script = options.script;
-    this.cwd = options.cwd;
-    this.agentTimeoutMs = options.agentTimeoutMs;
-    this.config = options.config;
-  }
-  async evaluate(context) {
-    const payload = {
-      question: context.evalCase.question,
-      expectedOutcome: context.evalCase.expected_outcome,
-      expectedMessages: context.evalCase.expected_messages,
-      referenceAnswer: context.evalCase.reference_answer,
-      candidateAnswer: context.candidate,
-      outputMessages: context.outputMessages ?? null,
-      guidelineFiles: context.evalCase.guideline_paths,
-      inputFiles: context.evalCase.file_paths.filter(
-        (path15) => !context.evalCase.guideline_paths.includes(path15)
-      ),
-      inputMessages: context.evalCase.input_messages,
-      traceSummary: context.traceSummary ?? null,
-      config: this.config ?? null
-    };
-    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
-    try {
-      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
-      return {
-        score,
-        verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
-        evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {}
-        }
-      };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [`Code evaluator failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message,
-        evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {},
-          error: message
-        }
-      };
-    }
-  }
-};
 function calculateRubricScore(result, rubrics) {
   const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
   const hits = [];
@@ -5833,273 +6149,281 @@ function calculateRubricScore(result, rubrics) {
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
   return { score, verdict, hits, misses };
 }
-async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
-  if (exitCode !== 0) {
-    const trimmedErr = formatStderr(stderr);
-    throw new Error(
-      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
-    );
-  }
-  return stdout.trim();
-}
-function formatStderr(stderr) {
-  const trimmed = stderr.trim();
-  const maxLength = 2e3;
-  if (trimmed.length <= maxLength) {
-    return trimmed;
-  }
-  const tail = trimmed.slice(-maxLength);
-  return `...(truncated, last ${maxLength} chars)
-${tail}`;
-}
-function parseJsonSafe(payload) {
-  try {
-    return JSON.parse(payload);
-  } catch {
-    return void 0;
-  }
-}
-function substituteVariables(template, variables) {
-  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
-    return variables[varName] ?? match;
-  });
-}
-function deepEqual(a, b) {
-  if (a === b) return true;
-  if (a === null || b === null) return a === b;
-  if (typeof a !== typeof b) return false;
-  if (typeof a !== "object") return a === b;
-  if (Array.isArray(a) !== Array.isArray(b)) return false;
-  if (Array.isArray(a) && Array.isArray(b)) {
-    if (a.length !== b.length) return false;
-    return a.every((val, i) => deepEqual(val, b[i]));
-  }
-  const aObj = a;
-  const bObj = b;
-  const aKeys = Object.keys(aObj);
-  const bKeys = Object.keys(bObj);
-  if (aKeys.length !== bKeys.length) return false;
-  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
-}
-function argsMatch(expected, actual) {
-  if (expected === void 0) return true;
-  if (expected === "any") return true;
-  if (actual === void 0) return false;
-  for (const key of Object.keys(expected)) {
-    if (!Object.hasOwn(actual, key)) return false;
-    if (!deepEqual(expected[key], actual[key])) return false;
-  }
-  return true;
-}
-var ToolTrajectoryEvaluator = class {
-  kind = "tool_trajectory";
+// src/evaluation/evaluators/composite.ts
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
+{{EVALUATOR_RESULTS_JSON}}
+Decide the final score and verdict based on all evaluator results.
+Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
+var CompositeEvaluator = class {
+  kind = "composite";
   config;
+  evaluatorFactory;
+  cwd;
   constructor(options) {
     this.config = options.config;
+    this.evaluatorFactory = options.evaluatorFactory;
+    this.cwd = options.cwd;
   }
-  evaluate(context) {
-    const { outputMessages, traceSummary } = context;
-    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
-    if (toolCalls.length === 0 && !traceSummary) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
-      };
-    }
-    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
-    if (!summary) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
-      };
-    }
-    switch (this.config.mode) {
-      case "any_order":
-        return this.evaluateAnyOrder(summary);
-      case "in_order":
-        return this.evaluateInOrder(toolCalls);
-      case "exact":
-        return this.evaluateExact(toolCalls);
-      default:
+  /**
+   * Runs every member evaluator concurrently and aggregates their results.
+   *
+   * Each entry of config.evaluators is turned into an evaluator through
+   * evaluatorFactory.create and evaluated with the same context. Results are
+   * tagged with the member's name (as `id`) and type, then passed to aggregate().
+   * Promise.all is used, so a rejection from any member rejects this call.
+   */
+  async evaluate(context) {
+    const memberResults = await Promise.all(
+      this.config.evaluators.map(async (memberConfig) => {
+        const evaluator = this.evaluatorFactory.create(memberConfig, context);
         return {
-          score: 0,
-          verdict: "fail",
-          hits: [],
-          misses: [`Unknown mode: ${this.config.mode}`],
-          expectedAspectCount: 1
+          id: memberConfig.name,
+          type: memberConfig.type,
+          result: await evaluator.evaluate(context)
         };
-    }
+      })
+    );
+    return this.aggregate(memberResults, context);
   }
-  /**
-   * Extract tool calls from output messages.
-   */
-  extractToolCallsFromMessages(messages) {
-    if (!messages) {
-      return [];
-    }
-    const toolCalls = [];
-    for (const message of messages) {
-      if (message.toolCalls) {
-        for (const call of message.toolCalls) {
-          toolCalls.push({
-            name: call.tool,
-            args: call.input
-          });
-        }
-      }
+  /**
+   * Dispatches member results to the aggregation strategy named by
+   * config.aggregator.type: "code_judge" runs a script, "llm_judge" asks the
+   * judge provider, and any other type falls back to a weighted average.
+   *
+   * NOTE(review): runCodeAggregator accepts a fourth `weights` argument that is
+   * not passed here, so it always records a weight of 1 - confirm this is intended.
+   */
+  async aggregate(results, context) {
+    const aggregator = this.config.aggregator;
+    switch (aggregator.type) {
+      case "code_judge":
+        // The aggregator's own cwd wins over the evaluator-level cwd.
+        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
+      case "llm_judge":
+        return this.runLlmAggregator(results, context, aggregator);
+      default:
+        return this.runWeightedAverage(results, aggregator.weights);
     }
-    return toolCalls;
   }
-  /**
-   * Build a summary from extracted tool calls.
-   */
-  buildSummary(toolCalls) {
-    const toolCallsByName = {};
-    for (const call of toolCalls) {
-      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
+  /**
+   * Combines member results into one result using a weighted mean of scores.
+   *
+   * `weights` is an optional map keyed by member id; members without an entry
+   * weigh 1. Hits and misses are concatenated with an `[id]` prefix, member
+   * reasoning strings are joined with "; ", and a per-member breakdown is
+   * returned in `evaluatorResults`. When the total weight is not positive the
+   * score is 0. The verdict is derived from the score before clamping.
+   */
+  runWeightedAverage(results, weights) {
+    let totalWeight = 0;
+    let weightedSum = 0;
+    const allHits = [];
+    const allMisses = [];
+    const reasoningParts = [];
+    const evaluatorResults = [];
+    for (const member of results) {
+      // Missing (null/undefined) weights default to 1; an explicit 0 is kept.
+      const weight = weights?.[member.id] ?? 1;
+      totalWeight += weight;
+      weightedSum += member.result.score * weight;
+      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
+      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
+      if (member.result.reasoning) {
+        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
+      }
+      // hits/misses are copied so the breakdown does not share arrays with the member result.
+      evaluatorResults.push({
+        name: member.id,
+        type: member.type,
+        score: member.result.score,
+        weight,
+        verdict: member.result.verdict,
+        hits: [...member.result.hits],
+        misses: [...member.result.misses],
+        reasoning: member.result.reasoning,
+        evaluatorRawRequest: member.result.evaluatorRawRequest,
+        evaluatorResults: member.result.evaluatorResults,
+        details: member.result.details
+      });
     }
-    const toolNames = Object.keys(toolCallsByName).sort();
+    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
     return {
-      eventCount: toolCalls.length,
-      toolNames,
-      toolCallsByName,
-      errorCount: 0
+      score: clampScore(finalScore),
+      verdict: scoreToVerdict(finalScore),
+      hits: allHits,
+      misses: allMisses,
+      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
+      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      evaluatorRawRequest: {
+        aggregator: "weighted_average",
+        ...weights ? { weights } : {}
+      },
+      evaluatorResults
     };
   }
-  evaluateAnyOrder(summary) {
-    const minimums = this.config.minimums ?? {};
-    const toolNames = Object.keys(minimums);
-    if (toolNames.length === 0) {
+  /**
+   * Aggregates member results by running an external script.
+   *
+   * The script receives `{ results: { [memberId]: result } }` as pretty-printed
+   * JSON on stdin (via executeScript) and its stdout is parsed as JSON. A numeric
+   * `score` is clamped, `hits` / `misses` keep only non-empty strings, and
+   * `verdict` is used when it is "pass", "fail" or "borderline", otherwise it is
+   * derived from the score. Unparseable stdout therefore yields a score of 0.
+   *
+   * Any error thrown while running the script is converted into a failing
+   * result whose miss and reasoning carry the error message; this method does
+   * not rethrow. The per-member breakdown is returned in both cases.
+   */
+  async runCodeAggregator(results, scriptPath, cwd, weights) {
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weights?.[member.id] ?? 1,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    try {
+      // No timeout value is supplied here (third argument is undefined).
+      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
       return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool requirements specified"],
-        misses: [],
-        expectedAspectCount: 0
+        score,
+        verdict,
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath
+        },
+        evaluatorResults
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath,
+          error: message
+        },
+        evaluatorResults
       };
     }
-    const hits = [];
-    const misses = [];
-    for (const toolName of toolNames) {
-      const required = minimums[toolName];
-      const actual = summary.toolCallsByName[toolName] ?? 0;
-      if (actual >= required) {
-        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
-      } else {
-        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
-      }
+  }
+  /**
+   * Aggregates member results by asking the judge provider for a final score.
+   *
+   * The member results are serialized to JSON and substituted for
+   * `{{EVALUATOR_RESULTS_JSON}}` in `config.prompt` (or the default aggregator
+   * prompt); the system prompt comes from buildOutputSchema(). When the provider
+   * exposes a language model through asLanguageModel(), generateText is used,
+   * otherwise the provider's invoke() is called and the last assistant message
+   * is parsed. The reply is validated with freeformEvaluationSchema, the score
+   * is clamped, hits and misses are capped at four entries each, and the
+   * verdict is derived from the score.
+   *
+   * Failures while calling the judge or parsing its reply do not reject: they
+   * produce a failing result whose miss, reasoning and raw request carry the
+   * error message, matching how runCodeAggregator reports script failures.
+   *
+   * @throws {Error} when the context carries no judge provider.
+   */
+  async runLlmAggregator(results, context, config) {
+    const judgeProvider = context.judgeProvider;
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for LLM aggregation");
     }
-    const score = hits.length / toolNames.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: toolNames.length
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const resultsJson = JSON.stringify(resultsObject, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
+    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
+    const systemPrompt = buildOutputSchema();
+    const evaluatorRawRequest = {
+      aggregator: "llm_judge",
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
     };
-  }
-  evaluateInOrder(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
+    try {
+      // Prefer a direct language-model call when the provider offers one.
+      const model = judgeProvider.asLanguageModel?.();
+      if (model) {
+        const { text } = await generateText3({
+          model,
+          system: systemPrompt,
+          prompt: userPrompt
+        });
+        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
+        const score2 = clampScore(data2.score);
+        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
+        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
+        const reasoning2 = data2.reasoning;
+        return {
+          score: score2,
+          verdict: scoreToVerdict(score2),
+          hits: hits2,
+          misses: misses2,
+          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
+          reasoning: reasoning2,
+          evaluatorRawRequest,
+          evaluatorResults
+        };
+      }
+      const response = await judgeProvider.invoke({
+        question: userPrompt,
+        systemPrompt,
+        evalCaseId: context.evalCase.id,
+        attempt: context.attempt
+      });
+      const data = freeformEvaluationSchema.parse(
+        parseJsonFromText(extractLastAssistantContent(response.outputMessages))
+      );
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning;
       return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
-        expectedAspectCount: 0
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: Math.max(hits.length + misses.length, 1),
+        reasoning,
+        evaluatorRawRequest,
+        evaluatorResults
       };
-    }
-    const hits = [];
-    const misses = [];
-    let actualIndex = 0;
-    for (let i = 0; i < expected.length; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      let found = false;
-      let argsMismatch = false;
-      while (actualIndex < toolCalls.length) {
-        const actualCall = toolCalls[actualIndex];
-        if (actualCall.name === expectedTool) {
-          if (argsMatch(expectedItem.args, actualCall.args)) {
-            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
-            actualIndex++;
-            found = true;
-            break;
-          }
-          misses.push(
-            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
-          );
-          actualIndex++;
-          argsMismatch = true;
-          break;
-        }
-        actualIndex++;
-      }
-      if (!found && !argsMismatch) {
-        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
-      }
-    }
-    const score = hits.length / expected.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
-    };
-  }
-  evaluateExact(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
+    } catch (error) {
+      // Surface the failure instead of dropping it, the way runCodeAggregator does.
+      const message = error instanceof Error ? error.message : String(error);
       return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
+        score: 0,
+        verdict: "fail",
+        hits: [],
-        misses: [],
-        expectedAspectCount: 0
+        misses: [`LLM aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: { ...evaluatorRawRequest, error: message },
+        evaluatorResults
       };
     }
-    const hits = [];
-    const misses = [];
-    if (toolCalls.length !== expected.length) {
-      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
-    }
-    const checkLength = Math.min(expected.length, toolCalls.length);
-    for (let i = 0; i < checkLength; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      const actualCall = toolCalls[i];
-      const actualTool = actualCall.name;
-      if (actualTool === expectedTool) {
-        if (argsMatch(expectedItem.args, actualCall.args)) {
-          hits.push(`Position ${i}: ${expectedTool}`);
-        } else {
-          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+  }
+};
+// src/evaluation/evaluators/cost.ts
+/**
+ * Evaluator that passes when the reported execution cost is within budget.
+ *
+ * Reads `config.budget` and `context.traceSummary.costUsd`. The result is
+ * binary: score 1 / "pass" when costUsd <= budget, otherwise score 0 / "fail".
+ * When the trace carries no cost (undefined or null) the result is a failure
+ * that says so, rather than a comparison against a missing number.
+ */
+var CostEvaluator = class {
+  kind = "cost";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  evaluate(context) {
+    const { budget } = this.config;
+    const costUsd = context.traceSummary?.costUsd;
+    // null must be treated like undefined: it would compare as 0 and then
+    // crash in toFixed() when the message is formatted.
+    if (costUsd === void 0 || costUsd === null) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No cost data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution cost not reported by provider",
+        evaluatorRawRequest: {
+          type: "cost",
+          budget,
+          costUsd: null
         }
-      } else {
-        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
-      }
-    }
-    for (let i = checkLength; i < expected.length; i++) {
-      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+      };
     }
-    const score = hits.length / expected.length;
+    const passed = costUsd <= budget;
+    const score = passed ? 1 : 0;
+    // Costs are shown in dollars with four decimal places.
+    const formatCost = (n) => `$${n.toFixed(4)}`;
     return {
       score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
+      verdict: passed ? "pass" : "fail",
+      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
+      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
+      expectedAspectCount: 1,
+      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
+      evaluatorRawRequest: {
+        type: "cost",
+        budget,
+        costUsd
+      }
     };
   }
 };
+// src/evaluation/evaluators/field-accuracy.ts
 var DEFAULT_DATE_FORMATS = [
   "YYYY-MM-DDTHH:mm:ssZ",
   // ISO with timezone
@@ -6312,434 +6636,209 @@ var FieldAccuracyEvaluator = class {
     }
     if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
-        path: path15,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path15} (invalid numeric value)`
-      };
-    }
-    const diff = Math.abs(candidateNum - expectedNum);
-    let withinTolerance;
-    if (relative) {
-      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
-      withinTolerance = relativeDiff <= tolerance;
-    } else {
-      withinTolerance = diff <= tolerance;
-    }
-    if (withinTolerance) {
-      return {
-        path: path15,
-        score: 1,
-        weight,
-        hit: true,
-        message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
-      };
-    }
-    return {
-      path: path15,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
-    };
-  }
-  /**
-   * Date comparison with format normalization.
-   */
-  compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
-    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
-    const candidateDate = parseDate(String(candidateValue), formats);
-    const expectedDate = parseDate(String(expectedValue), formats);
-    if (candidateDate === null) {
-      return {
-        path: path15,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path15} (unparseable candidate date)`
-      };
-    }
-    if (expectedDate === null) {
-      return {
-        path: path15,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path15} (unparseable expected date)`
-      };
-    }
-    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
-      return {
-        path: path15,
-        score: 1,
-        weight,
-        hit: true,
-        message: path15
-      };
-    }
-    return {
-      path: path15,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
-    };
-  }
-  /**
-   * Aggregate field results using configured strategy.
-   */
-  aggregateResults(results) {
-    const aggregation = this.config.aggregation ?? "weighted_average";
-    const hits = [];
-    const misses = [];
-    for (const result of results) {
-      if (result.hit) {
-        hits.push(result.message);
-      } else {
-        misses.push(result.message);
-      }
-    }
-    let score;
-    if (aggregation === "all_or_nothing") {
-      score = misses.length === 0 ? 1 : 0;
-    } else {
-      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
-      if (totalWeight === 0) {
-        score = results.length === 0 ? 1 : 0;
-      } else {
-        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
-        score = weightedSum / totalWeight;
-      }
-    }
-    const reasoning = `${hits.length}/${results.length} fields matched`;
-    return {
-      score: clampScore(score),
-      verdict: scoreToVerdict(score),
-      hits: hits.slice(0, 4),
-      misses: misses.slice(0, 4),
-      expectedAspectCount: results.length,
-      reasoning
-    };
-  }
-};
-function resolvePath(obj, path15) {
-  if (!path15 || !obj) {
-    return void 0;
-  }
-  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
-  let current = obj;
-  for (const part of parts) {
-    if (current === null || current === void 0) {
-      return void 0;
-    }
-    if (typeof current !== "object") {
-      return void 0;
-    }
-    const isIndex = /^\d+$/.test(part);
-    if (isIndex && Array.isArray(current)) {
-      current = current[Number.parseInt(part, 10)];
-    } else {
-      current = current[part];
-    }
-  }
-  return current;
-}
-function toNumber(value) {
-  if (typeof value === "number") {
-    return value;
-  }
-  if (typeof value === "string") {
-    const num = Number.parseFloat(value);
-    return Number.isNaN(num) ? null : num;
-  }
-  return null;
-}
-function parseDate(dateStr, formats) {
-  if (!dateStr) return null;
-  const trimmed = dateStr.trim();
-  const isoDate = new Date(trimmed);
-  if (!Number.isNaN(isoDate.getTime())) {
-    return isoDate;
-  }
-  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
-  if (localizedMatch) {
-    const day = Number.parseInt(localizedMatch[1], 10);
-    const monthName = localizedMatch[2].toLowerCase();
-    const year = Number.parseInt(localizedMatch[3], 10);
-    const month = MONTH_NAMES[monthName];
-    if (month !== void 0) {
-      return new Date(year, month, day);
-    }
-  }
-  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
-  if (usMatch) {
-    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
-    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
-    if (hasUSFormat && !hasEUFormat) {
-      const month = Number.parseInt(usMatch[1], 10) - 1;
-      const day = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else if (hasEUFormat && !hasUSFormat) {
-      const day = Number.parseInt(usMatch[1], 10);
-      const month = Number.parseInt(usMatch[2], 10) - 1;
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else {
-      const num1 = Number.parseInt(usMatch[1], 10);
-      const num2 = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (num1 > 12 && num2 <= 12) {
-        return new Date(year, num2 - 1, num1);
-      }
-      if (num2 > 12 && num1 <= 12) {
-        return new Date(year, num1 - 1, num2);
-      }
-      if (num1 <= 12 && num2 <= 31) {
-        return new Date(year, num1 - 1, num2);
-      }
-    }
-  }
-  return null;
-}
-function formatDateISO(date) {
-  return date.toISOString().split("T")[0];
-}
-function parseJsonFromTextSafe(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const match = cleaned.match(/\{[\s\S]*\}/);
-  const blob = match?.[0] ?? cleaned;
-  return JSON.parse(blob);
-}
-var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
-{{EVALUATOR_RESULTS_JSON}}
-Decide the final score and verdict based on all evaluator results.
-Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
-var CompositeEvaluator = class {
-  kind = "composite";
-  config;
-  evaluatorFactory;
-  cwd;
-  constructor(options) {
-    this.config = options.config;
-    this.evaluatorFactory = options.evaluatorFactory;
-    this.cwd = options.cwd;
-  }
-  async evaluate(context) {
-    const memberResults = await Promise.all(
-      this.config.evaluators.map(async (memberConfig) => {
-        const evaluator = this.evaluatorFactory.create(memberConfig, context);
-        return {
-          id: memberConfig.name,
-          type: memberConfig.type,
-          result: await evaluator.evaluate(context)
-        };
-      })
-    );
-    return this.aggregate(memberResults, context);
-  }
-  async aggregate(results, context) {
-    const aggregator = this.config.aggregator;
-    switch (aggregator.type) {
-      case "code_judge":
-        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
-      case "llm_judge":
-        return this.runLlmAggregator(results, context, aggregator);
-      default:
-        return this.runWeightedAverage(results, aggregator.weights);
-    }
-  }
-  runWeightedAverage(results, weights) {
-    let totalWeight = 0;
-    let weightedSum = 0;
-    const allHits = [];
-    const allMisses = [];
-    const reasoningParts = [];
-    const evaluatorResults = [];
-    for (const member of results) {
-      const weight = weights?.[member.id] ?? 1;
-      totalWeight += weight;
-      weightedSum += member.result.score * weight;
-      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
-      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
-      if (member.result.reasoning) {
-        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
-      }
-      evaluatorResults.push({
-        name: member.id,
-        type: member.type,
-        score: member.result.score,
-        weight,
-        verdict: member.result.verdict,
-        hits: [...member.result.hits],
-        misses: [...member.result.misses],
-        reasoning: member.result.reasoning,
-        evaluatorRawRequest: member.result.evaluatorRawRequest,
-        evaluatorResults: member.result.evaluatorResults
-      });
-    }
-    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
-    return {
-      score: clampScore(finalScore),
-      verdict: scoreToVerdict(finalScore),
-      hits: allHits,
-      misses: allMisses,
-      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
-      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
-      evaluatorRawRequest: {
-        aggregator: "weighted_average",
-        ...weights ? { weights } : {}
-      },
-      evaluatorResults
-    };
-  }
-  async runCodeAggregator(results, scriptPath, cwd, weights) {
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      weight: weights?.[member.id] ?? 1,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    try {
-      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
-      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
-      return {
-        score,
-        verdict,
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath
-        },
-        evaluatorResults
-      };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [`Code aggregator failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath,
-          error: message
-        },
-        evaluatorResults
-      };
-    }
-  }
-  async runLlmAggregator(results, context, config) {
-    const judgeProvider = context.judgeProvider;
-    if (!judgeProvider) {
-      throw new Error("No judge provider available for LLM aggregation");
-    }
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const resultsJson = JSON.stringify(resultsObject, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
-    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
-    const systemPrompt = buildOutputSchema();
-    const evaluatorRawRequest = {
-      aggregator: "llm_judge",
-      userPrompt,
-      systemPrompt,
-      target: judgeProvider.targetName
-    };
-    try {
-      const model = judgeProvider.asLanguageModel?.();
-      if (model) {
-        const { text } = await generateText2({
-          model,
-          system: systemPrompt,
-          prompt: userPrompt
-        });
-        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
-        const score2 = clampScore(data2.score);
-        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
-        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
-        const reasoning2 = data2.reasoning;
-        return {
-          score: score2,
-          verdict: scoreToVerdict(score2),
-          hits: hits2,
-          misses: misses2,
-          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
-          reasoning: reasoning2,
-          evaluatorRawRequest,
-          evaluatorResults
-        };
-      }
-      const response = await judgeProvider.invoke({
-        question: userPrompt,
-        systemPrompt,
-        evalCaseId: context.evalCase.id,
-        attempt: context.attempt
-      });
-      const data = freeformEvaluationSchema.parse(
-        parseJsonFromText(extractLastAssistantContent(response.outputMessages))
-      );
-      const score = clampScore(data.score);
-      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
-      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning;
-      return {
-        score,
-        verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: Math.max(hits.length + misses.length, 1),
-        reasoning,
-        evaluatorRawRequest,
-        evaluatorResults
+        path: path15,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path15} (invalid numeric value)`
       };
-    } catch {
+    }
+    const diff = Math.abs(candidateNum - expectedNum);
+    let withinTolerance;
+    if (relative) {
+      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
+      withinTolerance = relativeDiff <= tolerance;
+    } else {
+      withinTolerance = diff <= tolerance;
+    }
+    if (withinTolerance) {
+      return {
+        path: path15,
+        score: 1,
+        weight,
+        hit: true,
+        message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
+      };
+    }
+    return {
+      path: path15,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
+    };
+  }
+  /**
+   * Date comparison with format normalization.
+   */
+  compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
+    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
+    const candidateDate = parseDate(String(candidateValue), formats);
+    const expectedDate = parseDate(String(expectedValue), formats);
+    if (candidateDate === null) {
       return {
+        path: path15,
         score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [],
-        expectedAspectCount: 1,
-        evaluatorRawRequest,
-        evaluatorResults
+        weight,
+        hit: false,
+        message: `${path15} (unparseable candidate date)`
+      };
+    }
+    if (expectedDate === null) {
+      return {
+        path: path15,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path15} (unparseable expected date)`
+      };
+    }
+    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
+      return {
+        path: path15,
+        score: 1,
+        weight,
+        hit: true,
+        message: path15
       };
     }
+    return {
+      path: path15,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+    };
+  }
+  /**
+   * Aggregate field results using configured strategy.
+   */
+  aggregateResults(results) {
+    const aggregation = this.config.aggregation ?? "weighted_average";
+    const hits = [];
+    const misses = [];
+    for (const result of results) {
+      if (result.hit) {
+        hits.push(result.message);
+      } else {
+        misses.push(result.message);
+      }
+    }
+    let score;
+    if (aggregation === "all_or_nothing") {
+      score = misses.length === 0 ? 1 : 0;
+    } else {
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
+      if (totalWeight === 0) {
+        score = results.length === 0 ? 1 : 0;
+      } else {
+        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
+        score = weightedSum / totalWeight;
+      }
+    }
+    const reasoning = `${hits.length}/${results.length} fields matched`;
+    return {
+      score: clampScore(score),
+      verdict: scoreToVerdict(score),
+      hits: hits.slice(0, 4),
+      misses: misses.slice(0, 4),
+      expectedAspectCount: results.length,
+      reasoning
+    };
   }
 };
+function resolvePath(obj, path15) {
+  if (!path15 || !obj) {
+    return void 0;
+  }
+  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  let current = obj;
+  for (const part of parts) {
+    if (current === null || current === void 0) {
+      return void 0;
+    }
+    if (typeof current !== "object") {
+      return void 0;
+    }
+    const isIndex = /^\d+$/.test(part);
+    if (isIndex && Array.isArray(current)) {
+      current = current[Number.parseInt(part, 10)];
+    } else {
+      current = current[part];
+    }
+  }
+  return current;
+}
+function toNumber(value) {
+  if (typeof value === "number") {
+    return value;
+  }
+  if (typeof value === "string") {
+    const num = Number.parseFloat(value);
+    return Number.isNaN(num) ? null : num;
+  }
+  return null;
+}
+function parseDate(dateStr, formats) {
+  if (!dateStr) return null;
+  const trimmed = dateStr.trim();
+  const isoDate = new Date(trimmed);
+  if (!Number.isNaN(isoDate.getTime())) {
+    return isoDate;
+  }
+  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
+  if (localizedMatch) {
+    const day = Number.parseInt(localizedMatch[1], 10);
+    const monthName = localizedMatch[2].toLowerCase();
+    const year = Number.parseInt(localizedMatch[3], 10);
+    const month = MONTH_NAMES[monthName];
+    if (month !== void 0) {
+      return new Date(year, month, day);
+    }
+  }
+  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
+  if (usMatch) {
+    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
+    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
+    if (hasUSFormat && !hasEUFormat) {
+      const month = Number.parseInt(usMatch[1], 10) - 1;
+      const day = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else if (hasEUFormat && !hasUSFormat) {
+      const day = Number.parseInt(usMatch[1], 10);
+      const month = Number.parseInt(usMatch[2], 10) - 1;
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else {
+      const num1 = Number.parseInt(usMatch[1], 10);
+      const num2 = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (num1 > 12 && num2 <= 12) {
+        return new Date(year, num2 - 1, num1);
+      }
+      if (num2 > 12 && num1 <= 12) {
+        return new Date(year, num1 - 1, num2);
+      }
+      if (num1 <= 12 && num2 <= 31) {
+        return new Date(year, num1 - 1, num2);
+      }
+    }
+  }
+  return null;
+}
+function formatDateISO(date) {
+  return date.toISOString().split("T")[0];
+}
+function parseJsonFromTextSafe(text) {
+  return parseJsonFromText(text);
+}
+// src/evaluation/evaluators/latency.ts
 var LatencyEvaluator = class {
   kind = "latency";
   config;
@@ -6772,57 +6871,17 @@ var LatencyEvaluator = class {
       hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
       misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
       expectedAspectCount: 1,
-      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
-      evaluatorRawRequest: {
-        type: "latency",
-        threshold,
-        durationMs
-      }
-    };
-  }
-};
-var CostEvaluator = class {
-  kind = "cost";
-  config;
-  constructor(options) {
-    this.config = options.config;
-  }
-  evaluate(context) {
-    const { budget } = this.config;
-    const costUsd = context.traceSummary?.costUsd;
-    if (costUsd === void 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No cost data available in trace"],
-        expectedAspectCount: 1,
-        reasoning: "Execution cost not reported by provider",
-        evaluatorRawRequest: {
-          type: "cost",
-          budget,
-          costUsd: null
-        }
-      };
-    }
-    const passed = costUsd <= budget;
-    const score = passed ? 1 : 0;
-    const formatCost = (n) => `$${n.toFixed(4)}`;
-    return {
-      score,
-      verdict: passed ? "pass" : "fail",
-      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
-      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
-      expectedAspectCount: 1,
-      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
-      evaluatorRawRequest: {
-        type: "cost",
-        budget,
-        costUsd
+      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
+      evaluatorRawRequest: {
+        type: "latency",
+        threshold,
+        durationMs
       }
     };
   }
 };
+// src/evaluation/evaluators/token-usage.ts
 var TokenUsageEvaluator = class {
   kind = "token_usage";
   config;
@@ -6906,6 +6965,226 @@ var TokenUsageEvaluator = class {
   }
 };
+// src/evaluation/evaluators/tool-trajectory.ts
+function argsMatch(expected, actual) {
+  if (expected === void 0) return true;
+  if (expected === "any") return true;
+  if (actual === void 0) return false;
+  for (const key of Object.keys(expected)) {
+    if (!Object.hasOwn(actual, key)) return false;
+    if (!deepEqual(expected[key], actual[key])) return false;
+  }
+  return true;
+}
+var ToolTrajectoryEvaluator = class {
+  kind = "tool_trajectory";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  evaluate(context) {
+    const { outputMessages, traceSummary } = context;
+    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
+    if (toolCalls.length === 0 && !traceSummary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
+    if (!summary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    switch (this.config.mode) {
+      case "any_order":
+        return this.evaluateAnyOrder(summary);
+      case "in_order":
+        return this.evaluateInOrder(toolCalls);
+      case "exact":
+        return this.evaluateExact(toolCalls);
+      default:
+        return {
+          score: 0,
+          verdict: "fail",
+          hits: [],
+          misses: [`Unknown mode: ${this.config.mode}`],
+          expectedAspectCount: 1
+        };
+    }
+  }
+  /**
+   * Extract tool calls from output messages.
+   */
+  extractToolCallsFromMessages(messages) {
+    if (!messages) {
+      return [];
+    }
+    const toolCalls = [];
+    for (const message of messages) {
+      if (message.toolCalls) {
+        for (const call of message.toolCalls) {
+          toolCalls.push({
+            name: call.tool,
+            args: call.input
+          });
+        }
+      }
+    }
+    return toolCalls;
+  }
+  /**
+   * Build a summary from extracted tool calls.
+   */
+  buildSummary(toolCalls) {
+    const toolCallsByName = {};
+    for (const call of toolCalls) {
+      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
+    }
+    const toolNames = Object.keys(toolCallsByName).sort();
+    return {
+      eventCount: toolCalls.length,
+      toolNames,
+      toolCallsByName,
+      errorCount: 0
+    };
+  }
+  evaluateAnyOrder(summary) {
+    const minimums = this.config.minimums ?? {};
+    const toolNames = Object.keys(minimums);
+    if (toolNames.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool requirements specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    for (const toolName of toolNames) {
+      const required = minimums[toolName];
+      const actual = summary.toolCallsByName[toolName] ?? 0;
+      if (actual >= required) {
+        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      } else {
+        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      }
+    }
+    const score = hits.length / toolNames.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: toolNames.length
+    };
+  }
+  evaluateInOrder(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    let actualIndex = 0;
+    for (let i = 0; i < expected.length; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      let found = false;
+      let argsMismatch = false;
+      while (actualIndex < toolCalls.length) {
+        const actualCall = toolCalls[actualIndex];
+        if (actualCall.name === expectedTool) {
+          if (argsMatch(expectedItem.args, actualCall.args)) {
+            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            actualIndex++;
+            found = true;
+            break;
+          }
+          misses.push(
+            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
+          );
+          actualIndex++;
+          argsMismatch = true;
+          break;
+        }
+        actualIndex++;
+      }
+      if (!found && !argsMismatch) {
+        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
+      }
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+  evaluateExact(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    if (toolCalls.length !== expected.length) {
+      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
+    }
+    const checkLength = Math.min(expected.length, toolCalls.length);
+    for (let i = 0; i < checkLength; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      const actualCall = toolCalls[i];
+      const actualTool = actualCall.name;
+      if (actualTool === expectedTool) {
+        if (argsMatch(expectedItem.args, actualCall.args)) {
+          hits.push(`Position ${i}: ${expectedTool}`);
+        } else {
+          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+        }
+      } else {
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
 // src/evaluation/orchestrator.ts
 import { createHash } from "node:crypto";
 import path14 from "node:path";
@@ -7119,6 +7398,17 @@ async function runEvaluation(options) {
     }
     return getOrCreateProvider(resolvedJudge);
   };
+  const targetResolver = (name) => {
+    const resolved = resolveTargetByName(name);
+    if (!resolved) {
+      return void 0;
+    }
+    return getOrCreateProvider(resolved);
+  };
+  const availableTargets = [
+    target.name,
+    ...Array.from(targetDefinitions.keys())
+  ];
   const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
   const primaryProvider = getOrCreateProvider(target);
   const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -7148,7 +7438,9 @@ async function runEvaluation(options) {
         onResult,
         verbose,
         resolveJudgeProvider,
-        agentTimeoutMs
+        agentTimeoutMs,
+        targetResolver,
+        availableTargets
       });
     } catch (error) {
       if (verbose) {
@@ -7187,7 +7479,9 @@ async function runEvaluation(options) {
           cache,
           useCache,
           now,
-          judgeProvider
+          judgeProvider,
+          targetResolver,
+          availableTargets
         });
         if (onProgress) {
           await onProgress({
@@ -7254,7 +7548,9 @@ async function runBatchEvaluation(options) {
     onProgress,
     onResult,
     resolveJudgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    targetResolver,
+    availableTargets
   } = options;
   const promptInputsList = [];
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -7329,7 +7625,9 @@ async function runBatchEvaluation(options) {
         judgeProvider: await resolveJudgeProvider(target),
         agentTimeoutMs,
         outputMessages,
-        traceSummary
+        traceSummary,
+        targetResolver,
+        availableTargets
       });
       if (providerError) {
         result = { ...result, error: providerError };
@@ -7387,7 +7685,9 @@ async function runEvalCase(options) {
     cache,
     useCache,
     signal,
-    judgeProvider
+    judgeProvider,
+    targetResolver,
+    availableTargets
   } = options;
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -7461,7 +7761,9 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
     return providerError ? { ...result, error: providerError } : result;
   } catch (error) {
@@ -7481,7 +7783,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const gradeTimestamp = nowFn();
   const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -7496,7 +7800,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   const completedAt = nowFn();
   let agentProviderRequest;
@@ -7549,7 +7855,9 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
     return runEvaluatorList({
@@ -7565,7 +7873,9 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -7583,7 +7893,9 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   return { score };
 }
@@ -7601,7 +7913,9 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const scored = [];
   const evaluatorResults = [];
@@ -7639,7 +7953,8 @@ async function runEvaluatorList(options) {
           script: evaluator.script,
           cwd: evaluator.resolvedCwd ?? evaluator.cwd,
           agentTimeoutMs,
-          config: evaluator.config
+          config: evaluator.config,
+          target: evaluator.target
         });
         const score2 = await codeEvaluator.evaluate({
           evalCase,
@@ -7649,8 +7964,11 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
+          judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -7663,7 +7981,8 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluatorProviderRequest: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest,
+          details: score2.details
         });
       }
       if (evaluator.type === "composite") {
@@ -7677,7 +7996,8 @@ async function runEvaluatorList(options) {
                 script: memberConfig.script,
                 cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
                 agentTimeoutMs,
-                config: memberConfig.config
+                config: memberConfig.config,
+                target: memberConfig.target
               });
             case "composite":
               return new CompositeEvaluator({
@@ -7726,7 +8046,9 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -7922,11 +8244,11 @@ async function runEvaluatorList(options) {
     (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
     0
   );
-  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
+  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
     score: aggregateScore,
-    verdict: scoreToVerdict2(aggregateScore),
+    verdict: scoreToVerdict(aggregateScore),
     hits,
     misses,
     expectedAspectCount,
@@ -7973,18 +8295,6 @@ async function resolveCustomPrompt(config) {
   }
   return config.prompt;
 }
-function isNonEmptyString2(value) {
-  return typeof value === "string" && value.trim().length > 0;
-}
-function scoreToVerdict2(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
-}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
@@ -8127,7 +8437,8 @@ function mapChildResults(children) {
     misses: child.misses,
     reasoning: child.reasoning,
     evaluatorProviderRequest: child.evaluatorRawRequest,
-    evaluatorResults: mapChildResults(child.evaluatorResults)
+    evaluatorResults: mapChildResults(child.evaluatorResults),
+    details: child.details
   }));
 }
 function computeWeightedMean(entries) {
@@ -8142,7 +8453,7 @@ function computeWeightedMean(entries) {
 }
 // src/evaluation/generators/rubric-generator.ts
-import { generateText as generateText3 } from "ai";
+import { generateText as generateText4 } from "ai";
 import { z as z3 } from "zod";
 var rubricItemSchema = z3.object({
   id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -8176,7 +8487,7 @@ You must return a valid JSON object matching this schema:
   let lastError;
   for (let attempt = 1; attempt <= 3; attempt++) {
     try {
-      const { text } = await generateText3({
+      const { text } = await generateText4({
         model,
         system,
         prompt
@@ -8238,31 +8549,39 @@ export {
   ToolTrajectoryEvaluator,
   avgToolDurationMs,
   buildDirectoryChain,
+  buildOutputSchema,
   buildPromptInputs,
   buildSearchRoots,
+  clampScore,
   computeTraceSummary,
   consumeClaudeCodeLogEntries,
   consumeCodexLogEntries,
   consumePiLogEntries,
   createAgentKernel,
   createProvider,
+  deepEqual,
   ensureVSCodeSubagents,
+  executeScript,
   explorationRatio,
-  extractCodeBlocks,
+  extractJsonBlob,
   fileExists,
   findGitRoot,
+  freeformEvaluationSchema,
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,
+  isNonEmptyString,
   isTestMessage,
   isTestMessageRole,
   listTargetNames,
   loadEvalCases,
   mergeExecutionMetrics,
   normalizeLineEndings,
+  parseJsonFromText,
+  parseJsonSafe,
   readJsonFile,
   readTargetDefinitions,
   readTestSuiteMetadata,
@@ -8272,6 +8591,7 @@ export {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
+  scoreToVerdict,
   subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries,
   subscribeToPiLogEntries,