npm - @agentv/core - Version diff - 2.0.1 → 2.1.0 - Mend

@agentv/core 2.0.1 → 2.1.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/{chunk-IBTKEEOT.js → chunk-KDEP4I7G.js} +44 -1
package/dist/chunk-KDEP4I7G.js.map +1 -0
package/dist/evaluation/validation/index.cjs +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +1641 -1138
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +157 -100
package/dist/index.d.ts +157 -100
package/dist/index.js +1451 -997
package/dist/index.js.map +1 -1
package/package.json +4 -1
package/dist/chunk-IBTKEEOT.js.map +0 -1

package/dist/index.js (changed)

@@ -10,7 +10,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-IBTKEEOT.js";
+} from "./chunk-KDEP4I7G.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
 import path6 from "node:path";
 import { parse as parse2 } from "yaml";
-// src/evaluation/formatting/segment-formatter.ts
-function extractCodeBlocks(segments) {
-  const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
-  const codeBlocks = [];
-  for (const segment of segments) {
-    const typeValue = segment.type;
-    if (typeof typeValue !== "string" || typeValue !== "text") {
-      continue;
-    }
-    const textValue = segment.value;
-    if (typeof textValue !== "string") {
-      continue;
-    }
-    const matches = textValue.match(CODE_BLOCK_PATTERN);
-    if (matches) {
-      codeBlocks.push(...matches);
-    }
-  }
-  return codeBlocks;
-}
-function formatFileContents(parts) {
-  const fileCount = parts.filter((p) => p.isFile).length;
-  if (fileCount > 0) {
-    return parts.map((part) => {
-      if (part.isFile && part.displayPath) {
-        return `<file path="${part.displayPath}">
-${part.content}
-</file>`;
-      }
-      return part.content;
-    }).join("\n\n");
-  }
-  return parts.map((p) => p.content).join(" ");
-}
-function formatSegment(segment, mode = "lm") {
-  const type = asString(segment.type);
-  if (type === "text") {
-    return asString(segment.value);
-  }
-  if (type === "guideline_ref") {
-    const refPath = asString(segment.path);
-    return refPath ? `<Attached: ${refPath}>` : void 0;
-  }
-  if (type === "file") {
-    const filePath = asString(segment.path);
-    if (!filePath) {
-      return void 0;
-    }
-    if (mode === "agent") {
-      return `<file: path="${filePath}">`;
-    }
-    const text = asString(segment.text);
-    if (text && filePath) {
-      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
-    }
-  }
-  return void 0;
-}
-function hasVisibleContent(segments) {
-  return segments.some((segment) => {
-    const type = asString(segment.type);
-    if (type === "text") {
-      const value = asString(segment.value);
-      return value !== void 0 && value.trim().length > 0;
-    }
-    if (type === "guideline_ref") {
-      return false;
-    }
-    if (type === "file") {
-      const text = asString(segment.text);
-      return text !== void 0 && text.trim().length > 0;
-    }
-    return false;
-  });
-}
-function asString(value) {
-  return typeof value === "string" ? value : void 0;
-}
 // src/evaluation/loaders/config-loader.ts
 import { readFile } from "node:fs/promises";
 import path2 from "node:path";
@@ -483,7 +404,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
       continue;
     }
-    const name = asString2(rawEvaluator.name);
+    const name = asString(rawEvaluator.name);
     const typeValue = rawEvaluator.type;
     if (!name || !isEvaluatorKind(typeValue)) {
       logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -511,7 +432,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const cwd = asString2(rawEvaluator.cwd);
+      const cwd = asString(rawEvaluator.cwd);
       let resolvedCwd;
       if (cwd) {
         const resolved = await resolveFileReference2(cwd, searchRoots);
@@ -526,7 +447,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       } else {
         resolvedCwd = searchRoots[0];
       }
-      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
+      const rawTarget = rawEvaluator.target;
+      let targetConfig;
+      if (rawTarget !== void 0) {
+        if (isJsonObject2(rawTarget)) {
+          const maxCalls = rawTarget.max_calls;
+          if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
+            logWarning2(
+              `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
+            );
+          } else {
+            targetConfig = {
+              ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
+            };
+          }
+        } else if (rawTarget === true) {
+          targetConfig = {};
+        } else {
+          logWarning2(
+            `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
+          );
+        }
+      }
+      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
       const config = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
         if (!knownProps.has(key) && value !== void 0) {
@@ -540,7 +483,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         cwd,
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
-        ...Object.keys(config).length > 0 ? { config } : {}
+        ...Object.keys(config).length > 0 ? { config } : {},
+        ...targetConfig !== void 0 ? { target: targetConfig } : {}
       });
       continue;
     }
@@ -557,7 +501,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
         continue;
       }
-      const aggregatorType = asString2(rawAggregator.type);
+      const aggregatorType = asString(rawAggregator.type);
       if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
         logWarning2(
           `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -570,7 +514,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
           continue;
         }
-        const memberName = asString2(rawMember.name);
+        const memberName = asString(rawMember.name);
         const memberType = rawMember.type;
         if (!memberName || !isEvaluatorKind(memberType)) {
           logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -608,7 +552,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
         };
       } else if (aggregatorType === "code_judge") {
-        const aggregatorPath = asString2(rawAggregator.path);
+        const aggregatorPath = asString(rawAggregator.path);
         if (!aggregatorPath) {
           logWarning2(
             `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -621,7 +565,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           cwd: searchRoots[0]
         };
       } else {
-        const aggregatorPrompt = asString2(rawAggregator.prompt);
+        const aggregatorPrompt = asString(rawAggregator.prompt);
         let promptPath2;
         if (aggregatorPrompt) {
           const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
@@ -646,7 +590,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     if (typeValue === "tool_trajectory") {
-      const mode = asString2(rawEvaluator.mode);
+      const mode = asString(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
         logWarning2(
           `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -737,8 +681,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           );
           continue;
         }
-        const fieldPath = asString2(rawField.path);
-        const match = asString2(rawField.match);
+        const fieldPath = asString(rawField.path);
+        const match = asString(rawField.match);
         if (!fieldPath) {
           logWarning2(
             `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -768,7 +712,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
         continue;
       }
-      const aggregation = asString2(rawEvaluator.aggregation);
+      const aggregation = asString(rawEvaluator.aggregation);
       const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
@@ -849,7 +793,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    const prompt = asString2(rawEvaluator.prompt);
+    const prompt = asString(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
       const resolved = await resolveFileReference2(prompt, searchRoots);
@@ -868,11 +812,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
       }
     }
-    const _model = asString2(rawEvaluator.model);
+    const _model = asString(rawEvaluator.model);
     const rawRubrics = rawEvaluator.rubrics;
     const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-      id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-      description: asString2(rubric.description) ?? "",
+      id: asString(rubric.id) ?? `rubric-${index + 1}`,
+      description: asString(rubric.description) ?? "",
       weight: typeof rubric.weight === "number" ? rubric.weight : 1,
       required: typeof rubric.required === "boolean" ? rubric.required : true
     })).filter((r) => r.description.length > 0) : void 0;
@@ -916,7 +860,7 @@ function coerceEvaluator(candidate, contextId) {
   logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
   return void 0;
 }
-function asString2(value) {
+function asString(value) {
   return typeof value === "string" ? value : void 0;
 }
 function asStringArray(value, description) {
@@ -992,6 +936,68 @@ function isValidFieldAggregationType(value) {
 // src/evaluation/loaders/message-processor.ts
 import { readFile as readFile3 } from "node:fs/promises";
 import path4 from "node:path";
+// src/evaluation/formatting/segment-formatter.ts
+function formatFileContents(parts) {
+  const fileCount = parts.filter((p) => p.isFile).length;
+  if (fileCount > 0) {
+    return parts.map((part) => {
+      if (part.isFile && part.displayPath) {
+        return `<file path="${part.displayPath}">
+${part.content}
+</file>`;
+      }
+      return part.content;
+    }).join("\n\n");
+  }
+  return parts.map((p) => p.content).join(" ");
+}
+function formatSegment(segment, mode = "lm") {
+  const type = asString2(segment.type);
+  if (type === "text") {
+    return asString2(segment.value);
+  }
+  if (type === "guideline_ref") {
+    const refPath = asString2(segment.path);
+    return refPath ? `<Attached: ${refPath}>` : void 0;
+  }
+  if (type === "file") {
+    const filePath = asString2(segment.path);
+    if (!filePath) {
+      return void 0;
+    }
+    if (mode === "agent") {
+      return `<file: path="${filePath}">`;
+    }
+    const text = asString2(segment.text);
+    if (text && filePath) {
+      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
+    }
+  }
+  return void 0;
+}
+function hasVisibleContent(segments) {
+  return segments.some((segment) => {
+    const type = asString2(segment.type);
+    if (type === "text") {
+      const value = asString2(segment.value);
+      return value !== void 0 && value.trim().length > 0;
+    }
+    if (type === "guideline_ref") {
+      return false;
+    }
+    if (type === "file") {
+      const text = asString2(segment.text);
+      return text !== void 0 && text.trim().length > 0;
+    }
+    return false;
+  });
+}
+function asString2(value) {
+  return typeof value === "string" ? value : void 0;
+}
+// src/evaluation/loaders/message-processor.ts
 var ANSI_YELLOW4 = "\x1B[33m";
 var ANSI_RESET4 = "\x1B[0m";
 async function processMessages(options) {
@@ -1297,9 +1303,6 @@ ${messageContent}`);
         questionParts.push(formattedContent);
       }
     }
-    if (testCase.code_snippets.length > 0) {
-      questionParts.push(testCase.code_snippets.join("\n"));
-    }
     question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
   }
   const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1498,7 +1501,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       repoRootPath,
       verbose
     }) : [];
-    const codeSnippets = extractCodeBlocks(inputSegments);
     let referenceAnswer = "";
     if (outputSegments.length > 0) {
       const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1571,7 +1573,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
       file_paths: allFilePaths,
-      code_snippets: codeSnippets,
       expected_outcome: outcome,
       evaluator: evalCaseEvaluatorKind,
       evaluators
@@ -4084,6 +4085,167 @@ var MockProvider = class {
   }
 };
+// src/evaluation/providers/pi-agent-sdk.ts
+var piAgentModule = null;
+var piAiModule = null;
+async function loadPiModules() {
+  if (!piAgentModule || !piAiModule) {
+    try {
+      [piAgentModule, piAiModule] = await Promise.all([
+        import("@mariozechner/pi-agent"),
+        import("@mariozechner/pi-ai")
+      ]);
+    } catch (error) {
+      throw new Error(
+        `Failed to load pi-agent-sdk dependencies. Please install them:
+  npm install @mariozechner/pi-agent @mariozechner/pi-ai
+Original error: ${error instanceof Error ? error.message : String(error)}`
+      );
+    }
+  }
+  return {
+    Agent: piAgentModule.Agent,
+    ProviderTransport: piAgentModule.ProviderTransport,
+    getModel: piAiModule.getModel,
+    getEnvApiKey: piAiModule.getEnvApiKey
+  };
+}
+var PiAgentSdkProvider = class {
+  id;
+  kind = "pi-agent-sdk";
+  targetName;
+  supportsBatch = false;
+  config;
+  constructor(targetName, config) {
+    this.id = `pi-agent-sdk:${targetName}`;
+    this.targetName = targetName;
+    this.config = config;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("Pi agent SDK request was aborted before execution");
+    }
+    const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
+    const startTime = Date.now();
+    const providerName = this.config.provider ?? "anthropic";
+    const modelId = this.config.model ?? "claude-sonnet-4-20250514";
+    const model = getModel(providerName, modelId);
+    const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
+    const transport = new ProviderTransport({
+      getApiKey: async (provider) => {
+        return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
+      }
+    });
+    const agent = new Agent({
+      initialState: {
+        systemPrompt,
+        model,
+        tools: [],
+        // No tools for simple Q&A
+        messages: []
+      },
+      transport
+    });
+    const outputMessages = [];
+    let finalAssistantContent = "";
+    const unsubscribe = agent.subscribe((event) => {
+      if (event.type === "message_end") {
+        const msg = event.message;
+        if (msg.role === "assistant") {
+          const content = extractTextContent2(msg.content);
+          if (content) {
+            finalAssistantContent = content;
+          }
+        }
+      }
+    });
+    try {
+      const timeoutMs = this.config.timeoutMs ?? 12e4;
+      const timeoutPromise = new Promise((_, reject) => {
+        setTimeout(
+          () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
+          timeoutMs
+        );
+      });
+      await Promise.race([agent.prompt(request.question), timeoutPromise]);
+      await agent.waitForIdle();
+      const agentMessages = agent.state.messages;
+      for (const msg of agentMessages) {
+        outputMessages.push(convertAgentMessage(msg));
+      }
+      const durationMs = Date.now() - startTime;
+      return {
+        raw: {
+          messages: agentMessages,
+          systemPrompt,
+          model: this.config.model,
+          provider: this.config.provider
+        },
+        outputMessages,
+        durationMs
+      };
+    } finally {
+      unsubscribe();
+    }
+  }
+};
+function extractTextContent2(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const textParts = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") {
+      continue;
+    }
+    const p = part;
+    if (p.type === "text" && typeof p.text === "string") {
+      textParts.push(p.text);
+    }
+  }
+  return textParts.length > 0 ? textParts.join("\n") : void 0;
+}
+function convertAgentMessage(message) {
+  if (!message || typeof message !== "object") {
+    return { role: "unknown", content: String(message) };
+  }
+  const msg = message;
+  const role = typeof msg.role === "string" ? msg.role : "unknown";
+  const content = extractTextContent2(msg.content);
+  const toolCalls = extractToolCalls2(msg.content);
+  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
+  return {
+    role,
+    content,
+    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
+    timestamp
+  };
+}
+function extractToolCalls2(content) {
+  if (!Array.isArray(content)) {
+    return [];
+  }
+  const toolCalls = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") {
+      continue;
+    }
+    const p = part;
+    if (p.type === "tool_use" && typeof p.name === "string") {
+      toolCalls.push({
+        tool: p.name,
+        input: p.input,
+        id: typeof p.id === "string" ? p.id : void 0
+      });
+    }
+  }
+  return toolCalls;
+}
 // src/evaluation/providers/pi-coding-agent.ts
 import { spawn as spawn3 } from "node:child_process";
 import { randomUUID as randomUUID3 } from "node:crypto";
@@ -4599,8 +4761,8 @@ function convertPiMessage(message) {
   if (typeof role !== "string") {
     return void 0;
   }
-  const content = extractTextContent2(msg.content);
-  const toolCalls = extractToolCalls2(msg.content);
+  const content = extractTextContent3(msg.content);
+  const toolCalls = extractToolCalls3(msg.content);
   const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
   const metadata = {};
   if (msg.api) metadata.api = msg.api;
@@ -4616,7 +4778,7 @@ function convertPiMessage(message) {
     metadata: Object.keys(metadata).length > 0 ? metadata : void 0
   };
 }
-function extractTextContent2(content) {
+function extractTextContent3(content) {
   if (typeof content === "string") {
     return content;
   }
@@ -4635,7 +4797,7 @@ function extractTextContent2(content) {
   }
   return textParts.length > 0 ? textParts.join("\n") : void 0;
 }
-function extractToolCalls2(content) {
+function extractToolCalls3(content) {
   if (!Array.isArray(content)) {
     return [];
   }
@@ -5130,6 +5292,8 @@ function createProvider(target) {
       return new CodexProvider(target.name, target.config);
     case "pi-coding-agent":
       return new PiCodingAgentProvider(target.name, target.config);
+    case "pi-agent-sdk":
+      return new PiAgentSdkProvider(target.name, target.config);
     case "claude-code":
       return new ClaudeCodeProvider(target.name, target.config);
     case "mock":
@@ -5148,25 +5312,80 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
-// src/evaluation/evaluators.ts
-import { generateText as generateText2 } from "ai";
-import { z as z2 } from "zod";
-// src/runtime/exec.ts
-function shellEscapePath(value) {
-  if (process.platform === "win32") {
-    return `"${value.replaceAll('"', '""')}"`;
+// src/evaluation/evaluators/scoring.ts
+function scoreToVerdict(score) {
+  if (score >= 0.8) {
+    return "pass";
   }
-  return `'${value.replaceAll("'", `'"'"'`)}'`;
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
 }
-async function execFileWithStdin(argv, stdinPayload, options = {}) {
-  if (argv.length === 0) {
-    throw new Error("Executable argv must include at least one entry");
+function clampScore(value) {
+  if (Number.isNaN(value) || !Number.isFinite(value)) {
+    return 0;
   }
-  if (typeof Bun !== "undefined") {
-    return execFileWithStdinBun(argv, stdinPayload, options);
+  if (value < 0) {
+    return 0;
   }
-  return execFileWithStdinNode(argv, stdinPayload, options);
+  if (value > 1) {
+    return 1;
+  }
+  return value;
+}
+function extractJsonBlob(text) {
+  const match = text.match(/\{[\s\S]*\}/);
+  return match?.[0];
+}
+function parseJsonFromText(text) {
+  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+  const blob = extractJsonBlob(cleaned) ?? cleaned;
+  return JSON.parse(blob);
+}
+function isNonEmptyString(value) {
+  return typeof value === "string" && value.trim().length > 0;
+}
+function parseJsonSafe(payload) {
+  try {
+    return JSON.parse(payload);
+  } catch {
+    return void 0;
+  }
+}
+function deepEqual(a, b) {
+  if (a === b) return true;
+  if (a === null || b === null) return a === b;
+  if (typeof a !== typeof b) return false;
+  if (typeof a !== "object") return a === b;
+  if (Array.isArray(a) !== Array.isArray(b)) return false;
+  if (Array.isArray(a) && Array.isArray(b)) {
+    if (a.length !== b.length) return false;
+    return a.every((val, i) => deepEqual(val, b[i]));
+  }
+  const aObj = a;
+  const bObj = b;
+  const aKeys = Object.keys(aObj);
+  const bKeys = Object.keys(bObj);
+  if (aKeys.length !== bKeys.length) return false;
+  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
+}
+// src/runtime/exec.ts
+function shellEscapePath(value) {
+  if (process.platform === "win32") {
+    return `"${value.replaceAll('"', '""')}"`;
+  }
+  return `'${value.replaceAll("'", `'"'"'`)}'`;
+}
+async function execFileWithStdin(argv, stdinPayload, options = {}) {
+  if (argv.length === 0) {
+    throw new Error("Executable argv must include at least one entry");
+  }
+  if (typeof Bun !== "undefined") {
+    return execFileWithStdinBun(argv, stdinPayload, options);
+  }
+  return execFileWithStdinNode(argv, stdinPayload, options);
 }
 async function execFileWithStdinBun(argv, stdinPayload, options) {
   const command = [...argv];
@@ -5175,7 +5394,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
     cwd: options.cwd,
     stdin: encoder.encode(stdinPayload),
     stdout: "pipe",
-    stderr: "pipe"
+    stderr: "pipe",
+    // Merge additional env vars with process.env
+    env: options.env ? { ...process.env, ...options.env } : process.env
   });
   let timedOut = false;
   const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -5210,7 +5431,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
     const [cmd, ...args] = argv;
     const child = spawn4(cmd, args, {
       cwd: options.cwd,
-      stdio: ["pipe", "pipe", "pipe"]
+      stdio: ["pipe", "pipe", "pipe"],
+      // Merge additional env vars with process.env
+      env: options.env ? { ...process.env, ...options.env } : process.env
     });
     const stdoutChunks = [];
     const stderrChunks = [];
@@ -5263,7 +5486,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
       const child = spawn4(wrappedCommand, {
         shell: true,
         cwd: options.cwd,
-        stdio: ["ignore", "ignore", "ignore"]
+        stdio: ["ignore", "ignore", "ignore"],
+        // Merge additional env vars with process.env
+        env: options.env ? { ...process.env, ...options.env } : process.env
       });
       const timeout = options.timeoutMs ? setTimeout(() => {
         child.kill();
@@ -5290,6 +5515,221 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
   }
 }
+// src/runtime/target-proxy.ts
+import { randomBytes } from "node:crypto";
+import { createServer } from "node:http";
+var DEFAULT_MAX_CALLS = 50;
+async function createTargetProxy(options) {
+  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
+  const token = randomBytes(32).toString("hex");
+  let callCount = 0;
+  let isShutdown = false;
+  const targetsList = availableTargets ?? [defaultProvider.targetName];
+  function resolveProvider(targetName) {
+    if (targetName === void 0 || targetName === defaultProvider.targetName) {
+      return defaultProvider;
+    }
+    if (targetResolver) {
+      return targetResolver(targetName);
+    }
+    return void 0;
+  }
+  const server = createServer(async (req, res) => {
+    res.setHeader("Access-Control-Allow-Origin", "*");
+    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
+    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
+    if (req.method === "OPTIONS") {
+      res.writeHead(204);
+      res.end();
+      return;
+    }
+    const authHeader = req.headers.authorization;
+    if (!authHeader || authHeader !== `Bearer ${token}`) {
+      sendJson(res, 401, { error: "Unauthorized" });
+      return;
+    }
+    if (isShutdown) {
+      sendJson(res, 503, { error: "Proxy is shutting down" });
+      return;
+    }
+    const url2 = req.url ?? "";
+    if (req.method === "GET" && url2 === "/info") {
+      handleInfo(res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invoke") {
+      await handleInvoke(req, res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invokeBatch") {
+      await handleInvokeBatch(req, res);
+      return;
+    }
+    sendJson(res, 404, { error: "Not found" });
+  });
+  function handleInfo(res) {
+    const response = {
+      targetName: defaultProvider.targetName,
+      maxCalls,
+      callCount,
+      availableTargets: targetsList
+    };
+    sendJson(res, 200, response);
+  }
+  async function handleInvoke(req, res) {
+    if (callCount >= maxCalls) {
+      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
+      return;
+    }
+    try {
+      const body = await readBody(req);
+      const request = JSON.parse(body);
+      if (!request.question || typeof request.question !== "string") {
+        sendJson(res, 400, { error: "Missing required field: question" });
+        return;
+      }
+      const provider = resolveProvider(request.target);
+      if (!provider) {
+        sendJson(res, 400, {
+          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+        });
+        return;
+      }
+      callCount++;
+      const response = await provider.invoke({
+        question: request.question,
+        systemPrompt: request.systemPrompt,
+        evalCaseId: request.evalCaseId ?? "proxy",
+        attempt: request.attempt ?? 1
+      });
+      const outputMessages = response.outputMessages ?? [];
+      const rawText = extractLastAssistantContent2(outputMessages);
+      const result = {
+        outputMessages,
+        rawText
+      };
+      sendJson(res, 200, result);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
+    }
+  }
+  async function handleInvokeBatch(req, res) {
+    try {
+      const body = await readBody(req);
+      const { requests } = JSON.parse(body);
+      if (!Array.isArray(requests)) {
+        sendJson(res, 400, { error: "Missing required field: requests (array)" });
+        return;
+      }
+      if (callCount + requests.length > maxCalls) {
+        sendJson(res, 429, {
+          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
+        });
+        return;
+      }
+      const responses = [];
+      for (const request of requests) {
+        if (!request.question || typeof request.question !== "string") {
+          responses.push({
+            outputMessages: [],
+            rawText: "Error: Missing required field: question"
+          });
+          continue;
+        }
+        const provider = resolveProvider(request.target);
+        if (!provider) {
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+          });
+          continue;
+        }
+        callCount++;
+        try {
+          const response = await provider.invoke({
+            question: request.question,
+            systemPrompt: request.systemPrompt,
+            evalCaseId: request.evalCaseId ?? "proxy",
+            attempt: request.attempt ?? 1
+          });
+          const outputMessages = response.outputMessages ?? [];
+          responses.push({
+            outputMessages,
+            rawText: extractLastAssistantContent2(outputMessages)
+          });
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: ${message}`
+          });
+        }
+      }
+      sendJson(res, 200, { responses });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
+    }
+  }
+  await new Promise((resolve, reject) => {
+    server.once("error", reject);
+    server.listen(0, "127.0.0.1", () => {
+      server.removeListener("error", reject);
+      resolve();
+    });
+  });
+  const address = server.address();
+  const url = `http://127.0.0.1:${address.port}`;
+  return {
+    url,
+    token,
+    shutdown: async () => {
+      isShutdown = true;
+      return new Promise((resolve, reject) => {
+        server.close((err) => {
+          if (err) reject(err);
+          else resolve();
+        });
+      });
+    },
+    getUsageMetadata: () => ({
+      callCount,
+      maxCalls
+    })
+  };
+}
+function sendJson(res, statusCode, body) {
+  res.writeHead(statusCode, { "Content-Type": "application/json" });
+  res.end(JSON.stringify(body));
+}
+function readBody(req) {
+  return new Promise((resolve, reject) => {
+    const chunks = [];
+    req.on("data", (chunk) => chunks.push(chunk));
+    req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
+    req.on("error", reject);
+  });
+}
+function extractLastAssistantContent2(messages) {
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === "assistant" && msg.content !== void 0) {
+      if (typeof msg.content === "string") {
+        return msg.content;
+      }
+      if (Array.isArray(msg.content)) {
+        for (const part of msg.content) {
+          if (typeof part === "object" && part !== null && "text" in part) {
+            return String(part.text);
+          }
+        }
+      }
+    }
+  }
+  return void 0;
+}
 // src/evaluation/case-conversion.ts
 function toSnakeCase(str) {
   if (/^[A-Z]/.test(str)) {
@@ -5297,12 +5737,6 @@ function toSnakeCase(str) {
   }
   return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
 }
-function toCamelCase(str) {
-  if (/^[A-Z]/.test(str)) {
-    return str;
-  }
-  return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
-}
 function toSnakeCaseDeep(obj) {
   if (obj === null || obj === void 0) {
     return obj;
@@ -5320,25 +5754,148 @@ function toSnakeCaseDeep(obj) {
   }
   return obj;
 }
-function toCamelCaseDeep(obj) {
-  if (obj === null || obj === void 0) {
-    return obj;
-  }
-  if (Array.isArray(obj)) {
-    return obj.map((item) => toCamelCaseDeep(item));
+// src/evaluation/evaluators/code-evaluator.ts
+var CodeEvaluator = class {
+  kind = "code";
+  script;
+  cwd;
+  agentTimeoutMs;
+  config;
+  target;
+  constructor(options) {
+    this.script = options.script;
+    this.cwd = options.cwd;
+    this.agentTimeoutMs = options.agentTimeoutMs;
+    this.config = options.config;
+    this.target = options.target;
   }
-  if (typeof obj === "object") {
-    const result = {};
-    for (const [key, value] of Object.entries(obj)) {
-      const camelKey = toCamelCase(key);
-      result[camelKey] = toCamelCaseDeep(value);
+  async evaluate(context) {
+    const payload = {
+      question: context.evalCase.question,
+      expectedOutcome: context.evalCase.expected_outcome,
+      expectedMessages: context.evalCase.expected_messages,
+      referenceAnswer: context.evalCase.reference_answer,
+      candidateAnswer: context.candidate,
+      outputMessages: context.outputMessages ?? null,
+      guidelineFiles: context.evalCase.guideline_paths,
+      inputFiles: context.evalCase.file_paths.filter(
+        (path15) => !context.evalCase.guideline_paths.includes(path15)
+      ),
+      inputMessages: context.evalCase.input_messages,
+      traceSummary: context.traceSummary ?? null,
+      config: this.config ?? null
+    };
+    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+    let proxyEnv;
+    let proxyShutdown;
+    let getProxyUsage;
+    if (this.target !== void 0 && context.judgeProvider) {
+      const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
+      const proxy = await createTargetProxy({
+        defaultProvider: context.judgeProvider,
+        targetResolver: context.targetResolver,
+        availableTargets: context.availableTargets,
+        maxCalls
+      });
+      proxyEnv = {
+        AGENTV_TARGET_PROXY_URL: proxy.url,
+        AGENTV_TARGET_PROXY_TOKEN: proxy.token
+      };
+      proxyShutdown = proxy.shutdown;
+      getProxyUsage = proxy.getUsageMetadata;
+    }
+    try {
+      const stdout = await executeScript(
+        this.script,
+        inputPayload,
+        this.agentTimeoutMs,
+        this.cwd,
+        proxyEnv
+      );
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
+      const proxyUsage = getProxyUsage?.();
+      const evaluatorRawRequest = {
+        script: this.script,
+        ...this.cwd ? { cwd: this.cwd } : {},
+        ...proxyUsage ? {
+          target_proxy: {
+            call_count: proxyUsage.callCount,
+            max_calls: proxyUsage.maxCalls
+          }
+        } : {}
+      };
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest,
+        ...details ? { details } : {}
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      const proxyUsage = getProxyUsage?.();
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code evaluator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          script: this.script,
+          ...this.cwd ? { cwd: this.cwd } : {},
+          ...proxyUsage ? {
+            target_proxy: {
+              call_count: proxyUsage.callCount,
+              max_calls: proxyUsage.maxCalls
+            }
+          } : {},
+          error: message
+        }
+      };
+    } finally {
+      if (proxyShutdown) {
+        await proxyShutdown();
+      }
     }
-    return result;
   }
-  return obj;
+};
+async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
+  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
+  if (exitCode !== 0) {
+    const trimmedErr = formatStderr(stderr);
+    throw new Error(
+      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
+    );
+  }
+  return stdout.trim();
+}
+function formatStderr(stderr) {
+  const trimmed = stderr.trim();
+  const maxLength = 2e3;
+  if (trimmed.length <= maxLength) {
+    return trimmed;
+  }
+  const tail = trimmed.slice(-maxLength);
+  return `...(truncated, last ${maxLength} chars)
+${tail}`;
 }
-// src/evaluation/evaluators.ts
+// src/evaluation/evaluators/composite.ts
+import { generateText as generateText3 } from "ai";
+// src/evaluation/evaluators/llm-judge.ts
+import { generateText as generateText2 } from "ai";
+import { z as z2 } from "zod";
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -5418,7 +5975,7 @@ var LlmJudgeEvaluator = class {
       target: judgeProvider.targetName
     };
     try {
-      const { data, providerResponse } = await this.runWithRetry({
+      const { data } = await this.runWithRetry({
         context,
         judgeProvider,
         systemPrompt,
@@ -5567,105 +6124,11 @@ You must return a valid JSON object matching this schema:
   "overall_reasoning": "string (summary)"
 }`;
 }
-function scoreToVerdict(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
+function substituteVariables(template, variables) {
+  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
+    return variables[varName] ?? match;
+  });
 }
-function clampScore(value) {
-  if (Number.isNaN(value) || !Number.isFinite(value)) {
-    return 0;
-  }
-  if (value < 0) {
-    return 0;
-  }
-  if (value > 1) {
-    return 1;
-  }
-  return value;
-}
-function extractJsonBlob(text) {
-  const match = text.match(/\{[\s\S]*\}/);
-  return match?.[0];
-}
-function parseJsonFromText(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const blob = extractJsonBlob(cleaned) ?? cleaned;
-  return JSON.parse(blob);
-}
-function isNonEmptyString(value) {
-  return typeof value === "string" && value.trim().length > 0;
-}
-var CodeEvaluator = class {
-  kind = "code";
-  script;
-  cwd;
-  agentTimeoutMs;
-  config;
-  constructor(options) {
-    this.script = options.script;
-    this.cwd = options.cwd;
-    this.agentTimeoutMs = options.agentTimeoutMs;
-    this.config = options.config;
-  }
-  async evaluate(context) {
-    const payload = {
-      question: context.evalCase.question,
-      expectedOutcome: context.evalCase.expected_outcome,
-      expectedMessages: context.evalCase.expected_messages,
-      referenceAnswer: context.evalCase.reference_answer,
-      candidateAnswer: context.candidate,
-      outputMessages: context.outputMessages ?? null,
-      guidelineFiles: context.evalCase.guideline_paths,
-      inputFiles: context.evalCase.file_paths.filter(
-        (path15) => !context.evalCase.guideline_paths.includes(path15)
-      ),
-      inputMessages: context.evalCase.input_messages,
-      traceSummary: context.traceSummary ?? null,
-      config: this.config ?? null
-    };
-    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
-    try {
-      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
-      return {
-        score,
-        verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
-        evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {}
-        }
-      };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [`Code evaluator failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message,
-        evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {},
-          error: message
-        }
-      };
-    }
-  }
-};
 function calculateRubricScore(result, rubrics) {
   const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
   const hits = [];
@@ -5693,273 +6156,281 @@ function calculateRubricScore(result, rubrics) {
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
   return { score, verdict, hits, misses };
 }
-async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
-  if (exitCode !== 0) {
-    const trimmedErr = formatStderr(stderr);
-    throw new Error(
-      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
-    );
-  }
-  return stdout.trim();
-}
-function formatStderr(stderr) {
-  const trimmed = stderr.trim();
-  const maxLength = 2e3;
-  if (trimmed.length <= maxLength) {
-    return trimmed;
-  }
-  const tail = trimmed.slice(-maxLength);
-  return `...(truncated, last ${maxLength} chars)
-${tail}`;
-}
-function parseJsonSafe(payload) {
-  try {
-    return JSON.parse(payload);
-  } catch {
-    return void 0;
-  }
-}
-function substituteVariables(template, variables) {
-  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
-    return variables[varName] ?? match;
-  });
-}
-function deepEqual(a, b) {
-  if (a === b) return true;
-  if (a === null || b === null) return a === b;
-  if (typeof a !== typeof b) return false;
-  if (typeof a !== "object") return a === b;
-  if (Array.isArray(a) !== Array.isArray(b)) return false;
-  if (Array.isArray(a) && Array.isArray(b)) {
-    if (a.length !== b.length) return false;
-    return a.every((val, i) => deepEqual(val, b[i]));
-  }
-  const aObj = a;
-  const bObj = b;
-  const aKeys = Object.keys(aObj);
-  const bKeys = Object.keys(bObj);
-  if (aKeys.length !== bKeys.length) return false;
-  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
-}
-function argsMatch(expected, actual) {
-  if (expected === void 0) return true;
-  if (expected === "any") return true;
-  if (actual === void 0) return false;
-  for (const key of Object.keys(expected)) {
-    if (!Object.hasOwn(actual, key)) return false;
-    if (!deepEqual(expected[key], actual[key])) return false;
-  }
-  return true;
-}
-var ToolTrajectoryEvaluator = class {
-  kind = "tool_trajectory";
+// src/evaluation/evaluators/composite.ts
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
+{{EVALUATOR_RESULTS_JSON}}
+Decide the final score and verdict based on all evaluator results.
+Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
+var CompositeEvaluator = class {
+  kind = "composite";
   config;
+  evaluatorFactory;
+  cwd;
   constructor(options) {
     this.config = options.config;
+    this.evaluatorFactory = options.evaluatorFactory;
+    this.cwd = options.cwd;
   }
-  evaluate(context) {
-    const { outputMessages, traceSummary } = context;
-    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
-    if (toolCalls.length === 0 && !traceSummary) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
-      };
-    }
-    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
-    if (!summary) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
-      };
-    }
-    switch (this.config.mode) {
-      case "any_order":
-        return this.evaluateAnyOrder(summary);
-      case "in_order":
-        return this.evaluateInOrder(toolCalls);
-      case "exact":
-        return this.evaluateExact(toolCalls);
-      default:
+  async evaluate(context) {
+    const memberResults = await Promise.all(
+      this.config.evaluators.map(async (memberConfig) => {
+        const evaluator = this.evaluatorFactory.create(memberConfig, context);
         return {
-          score: 0,
-          verdict: "fail",
-          hits: [],
-          misses: [`Unknown mode: ${this.config.mode}`],
-          expectedAspectCount: 1
+          id: memberConfig.name,
+          type: memberConfig.type,
+          result: await evaluator.evaluate(context)
         };
-    }
+      })
+    );
+    return this.aggregate(memberResults, context);
   }
-  /**
-   * Extract tool calls from output messages.
-   */
-  extractToolCallsFromMessages(messages) {
-    if (!messages) {
-      return [];
+  async aggregate(results, context) {
+    const aggregator = this.config.aggregator;
+    switch (aggregator.type) {
+      case "code_judge":
+        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
+      case "llm_judge":
+        return this.runLlmAggregator(results, context, aggregator);
+      default:
+        return this.runWeightedAverage(results, aggregator.weights);
     }
-    const toolCalls = [];
-    for (const message of messages) {
-      if (message.toolCalls) {
-        for (const call of message.toolCalls) {
-          toolCalls.push({
-            name: call.tool,
-            args: call.input
-          });
-        }
+  }
+  runWeightedAverage(results, weights) {
+    let totalWeight = 0;
+    let weightedSum = 0;
+    const allHits = [];
+    const allMisses = [];
+    const reasoningParts = [];
+    const evaluatorResults = [];
+    for (const member of results) {
+      const weight = weights?.[member.id] ?? 1;
+      totalWeight += weight;
+      weightedSum += member.result.score * weight;
+      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
+      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
+      if (member.result.reasoning) {
+        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
       }
+      evaluatorResults.push({
+        name: member.id,
+        type: member.type,
+        score: member.result.score,
+        weight,
+        verdict: member.result.verdict,
+        hits: [...member.result.hits],
+        misses: [...member.result.misses],
+        reasoning: member.result.reasoning,
+        evaluatorRawRequest: member.result.evaluatorRawRequest,
+        evaluatorResults: member.result.evaluatorResults,
+        details: member.result.details
+      });
     }
-    return toolCalls;
-  }
-  /**
-   * Build a summary from extracted tool calls.
-   */
-  buildSummary(toolCalls) {
-    const toolCallsByName = {};
-    for (const call of toolCalls) {
-      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
-    }
-    const toolNames = Object.keys(toolCallsByName).sort();
+    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
     return {
-      eventCount: toolCalls.length,
-      toolNames,
-      toolCallsByName,
-      errorCount: 0
+      score: clampScore(finalScore),
+      verdict: scoreToVerdict(finalScore),
+      hits: allHits,
+      misses: allMisses,
+      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
+      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      evaluatorRawRequest: {
+        aggregator: "weighted_average",
+        ...weights ? { weights } : {}
+      },
+      evaluatorResults
     };
   }
-  evaluateAnyOrder(summary) {
-    const minimums = this.config.minimums ?? {};
-    const toolNames = Object.keys(minimums);
-    if (toolNames.length === 0) {
+  async runCodeAggregator(results, scriptPath, cwd, weights) {
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weights?.[member.id] ?? 1,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    try {
+      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
       return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool requirements specified"],
-        misses: [],
-        expectedAspectCount: 0
+        score,
+        verdict,
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath
+        },
+        evaluatorResults
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath,
+          error: message
+        },
+        evaluatorResults
       };
     }
-    const hits = [];
-    const misses = [];
-    for (const toolName of toolNames) {
-      const required = minimums[toolName];
-      const actual = summary.toolCallsByName[toolName] ?? 0;
-      if (actual >= required) {
-        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
-      } else {
-        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
-      }
+  }
+  async runLlmAggregator(results, context, config) {
+    const judgeProvider = context.judgeProvider;
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for LLM aggregation");
     }
-    const score = hits.length / toolNames.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: toolNames.length
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const resultsJson = JSON.stringify(resultsObject, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
+    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
+    const systemPrompt = buildOutputSchema();
+    const evaluatorRawRequest = {
+      aggregator: "llm_judge",
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
     };
-  }
-  evaluateInOrder(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
+    try {
+      const model = judgeProvider.asLanguageModel?.();
+      if (model) {
+        const { text } = await generateText3({
+          model,
+          system: systemPrompt,
+          prompt: userPrompt
+        });
+        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
+        const score2 = clampScore(data2.score);
+        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
+        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
+        const reasoning2 = data2.reasoning;
+        return {
+          score: score2,
+          verdict: scoreToVerdict(score2),
+          hits: hits2,
+          misses: misses2,
+          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
+          reasoning: reasoning2,
+          evaluatorRawRequest,
+          evaluatorResults
+        };
+      }
+      const response = await judgeProvider.invoke({
+        question: userPrompt,
+        systemPrompt,
+        evalCaseId: context.evalCase.id,
+        attempt: context.attempt
+      });
+      const data = freeformEvaluationSchema.parse(
+        parseJsonFromText(extractLastAssistantContent(response.outputMessages))
+      );
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning;
       return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: Math.max(hits.length + misses.length, 1),
+        reasoning,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    } catch {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
         misses: [],
-        expectedAspectCount: 0
+        expectedAspectCount: 1,
+        evaluatorRawRequest,
+        evaluatorResults
       };
     }
-    const hits = [];
-    const misses = [];
-    let actualIndex = 0;
-    for (let i = 0; i < expected.length; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      let found = false;
-      let argsMismatch = false;
-      while (actualIndex < toolCalls.length) {
-        const actualCall = toolCalls[actualIndex];
-        if (actualCall.name === expectedTool) {
-          if (argsMatch(expectedItem.args, actualCall.args)) {
-            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
-            actualIndex++;
-            found = true;
-            break;
-          }
-          misses.push(
-            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
-          );
-          actualIndex++;
-          argsMismatch = true;
-          break;
+  }
+};
+// src/evaluation/evaluators/cost.ts
+var CostEvaluator = class {
+  kind = "cost";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  evaluate(context) {
+    const { budget } = this.config;
+    const costUsd = context.traceSummary?.costUsd;
+    if (costUsd === void 0) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No cost data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution cost not reported by provider",
+        evaluatorRawRequest: {
+          type: "cost",
+          budget,
+          costUsd: null
         }
-        actualIndex++;
-      }
-      if (!found && !argsMismatch) {
-        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
-      }
+      };
     }
-    const score = hits.length / expected.length;
+    const passed = costUsd <= budget;
+    const score = passed ? 1 : 0;
+    const formatCost = (n) => `$${n.toFixed(4)}`;
     return {
       score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
-    };
-  }
-  evaluateExact(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
-        expectedAspectCount: 0
-      };
-    }
-    const hits = [];
-    const misses = [];
-    if (toolCalls.length !== expected.length) {
-      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
-    }
-    const checkLength = Math.min(expected.length, toolCalls.length);
-    for (let i = 0; i < checkLength; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      const actualCall = toolCalls[i];
-      const actualTool = actualCall.name;
-      if (actualTool === expectedTool) {
-        if (argsMatch(expectedItem.args, actualCall.args)) {
-          hits.push(`Position ${i}: ${expectedTool}`);
-        } else {
-          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
-        }
-      } else {
-        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
-      }
-    }
-    for (let i = checkLength; i < expected.length; i++) {
-      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
-    }
-    const score = hits.length / expected.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
+      verdict: passed ? "pass" : "fail",
+      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
+      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
+      expectedAspectCount: 1,
+      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
+      evaluatorRawRequest: {
+        type: "cost",
+        budget,
+        costUsd
+      }
     };
   }
 };
+// src/evaluation/evaluators/field-accuracy.ts
 var DEFAULT_DATE_FORMATS = [
   "YYYY-MM-DDTHH:mm:ssZ",
   // ISO with timezone
@@ -6168,438 +6639,213 @@ var FieldAccuracyEvaluator = class {
         weight,
         hit: false,
         message: `${path15} (non-numeric value)`
-      };
-    }
-    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
-      return {
-        path: path15,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path15} (invalid numeric value)`
-      };
-    }
-    const diff = Math.abs(candidateNum - expectedNum);
-    let withinTolerance;
-    if (relative) {
-      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
-      withinTolerance = relativeDiff <= tolerance;
-    } else {
-      withinTolerance = diff <= tolerance;
-    }
-    if (withinTolerance) {
-      return {
-        path: path15,
-        score: 1,
-        weight,
-        hit: true,
-        message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
-      };
-    }
-    return {
-      path: path15,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
-    };
-  }
-  /**
-   * Date comparison with format normalization.
-   */
-  compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
-    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
-    const candidateDate = parseDate(String(candidateValue), formats);
-    const expectedDate = parseDate(String(expectedValue), formats);
-    if (candidateDate === null) {
-      return {
-        path: path15,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path15} (unparseable candidate date)`
-      };
-    }
-    if (expectedDate === null) {
-      return {
-        path: path15,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path15} (unparseable expected date)`
-      };
-    }
-    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
-      return {
-        path: path15,
-        score: 1,
-        weight,
-        hit: true,
-        message: path15
-      };
-    }
-    return {
-      path: path15,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
-    };
-  }
-  /**
-   * Aggregate field results using configured strategy.
-   */
-  aggregateResults(results) {
-    const aggregation = this.config.aggregation ?? "weighted_average";
-    const hits = [];
-    const misses = [];
-    for (const result of results) {
-      if (result.hit) {
-        hits.push(result.message);
-      } else {
-        misses.push(result.message);
-      }
-    }
-    let score;
-    if (aggregation === "all_or_nothing") {
-      score = misses.length === 0 ? 1 : 0;
-    } else {
-      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
-      if (totalWeight === 0) {
-        score = results.length === 0 ? 1 : 0;
-      } else {
-        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
-        score = weightedSum / totalWeight;
-      }
-    }
-    const reasoning = `${hits.length}/${results.length} fields matched`;
-    return {
-      score: clampScore(score),
-      verdict: scoreToVerdict(score),
-      hits: hits.slice(0, 4),
-      misses: misses.slice(0, 4),
-      expectedAspectCount: results.length,
-      reasoning
-    };
-  }
-};
-function resolvePath(obj, path15) {
-  if (!path15 || !obj) {
-    return void 0;
-  }
-  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
-  let current = obj;
-  for (const part of parts) {
-    if (current === null || current === void 0) {
-      return void 0;
-    }
-    if (typeof current !== "object") {
-      return void 0;
-    }
-    const isIndex = /^\d+$/.test(part);
-    if (isIndex && Array.isArray(current)) {
-      current = current[Number.parseInt(part, 10)];
-    } else {
-      current = current[part];
-    }
-  }
-  return current;
-}
-function toNumber(value) {
-  if (typeof value === "number") {
-    return value;
-  }
-  if (typeof value === "string") {
-    const num = Number.parseFloat(value);
-    return Number.isNaN(num) ? null : num;
-  }
-  return null;
-}
-function parseDate(dateStr, formats) {
-  if (!dateStr) return null;
-  const trimmed = dateStr.trim();
-  const isoDate = new Date(trimmed);
-  if (!Number.isNaN(isoDate.getTime())) {
-    return isoDate;
-  }
-  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
-  if (localizedMatch) {
-    const day = Number.parseInt(localizedMatch[1], 10);
-    const monthName = localizedMatch[2].toLowerCase();
-    const year = Number.parseInt(localizedMatch[3], 10);
-    const month = MONTH_NAMES[monthName];
-    if (month !== void 0) {
-      return new Date(year, month, day);
-    }
-  }
-  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
-  if (usMatch) {
-    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
-    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
-    if (hasUSFormat && !hasEUFormat) {
-      const month = Number.parseInt(usMatch[1], 10) - 1;
-      const day = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else if (hasEUFormat && !hasUSFormat) {
-      const day = Number.parseInt(usMatch[1], 10);
-      const month = Number.parseInt(usMatch[2], 10) - 1;
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else {
-      const num1 = Number.parseInt(usMatch[1], 10);
-      const num2 = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (num1 > 12 && num2 <= 12) {
-        return new Date(year, num2 - 1, num1);
-      }
-      if (num2 > 12 && num1 <= 12) {
-        return new Date(year, num1 - 1, num2);
-      }
-      if (num1 <= 12 && num2 <= 31) {
-        return new Date(year, num1 - 1, num2);
-      }
-    }
-  }
-  return null;
-}
-function formatDateISO(date) {
-  return date.toISOString().split("T")[0];
-}
-function parseJsonFromTextSafe(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const match = cleaned.match(/\{[\s\S]*\}/);
-  const blob = match?.[0] ?? cleaned;
-  return JSON.parse(blob);
-}
-var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
-{{EVALUATOR_RESULTS_JSON}}
-Decide the final score and verdict based on all evaluator results.
-Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
-var CompositeEvaluator = class {
-  kind = "composite";
-  config;
-  evaluatorFactory;
-  cwd;
-  constructor(options) {
-    this.config = options.config;
-    this.evaluatorFactory = options.evaluatorFactory;
-    this.cwd = options.cwd;
-  }
-  async evaluate(context) {
-    const memberResults = await Promise.all(
-      this.config.evaluators.map(async (memberConfig) => {
-        const evaluator = this.evaluatorFactory.create(memberConfig, context);
-        return {
-          id: memberConfig.name,
-          type: memberConfig.type,
-          result: await evaluator.evaluate(context)
-        };
-      })
-    );
-    return this.aggregate(memberResults, context);
-  }
-  async aggregate(results, context) {
-    const aggregator = this.config.aggregator;
-    switch (aggregator.type) {
-      case "code_judge":
-        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
-      case "llm_judge":
-        return this.runLlmAggregator(results, context, aggregator);
-      default:
-        return this.runWeightedAverage(results, aggregator.weights);
-    }
-  }
-  runWeightedAverage(results, weights) {
-    let totalWeight = 0;
-    let weightedSum = 0;
-    const allHits = [];
-    const allMisses = [];
-    const reasoningParts = [];
-    const evaluatorResults = [];
-    for (const member of results) {
-      const weight = weights?.[member.id] ?? 1;
-      totalWeight += weight;
-      weightedSum += member.result.score * weight;
-      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
-      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
-      if (member.result.reasoning) {
-        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
-      }
-      evaluatorResults.push({
-        name: member.id,
-        type: member.type,
-        score: member.result.score,
-        weight,
-        verdict: member.result.verdict,
-        hits: [...member.result.hits],
-        misses: [...member.result.misses],
-        reasoning: member.result.reasoning,
-        evaluatorRawRequest: member.result.evaluatorRawRequest,
-        evaluatorResults: member.result.evaluatorResults
-      });
-    }
-    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
-    return {
-      score: clampScore(finalScore),
-      verdict: scoreToVerdict(finalScore),
-      hits: allHits,
-      misses: allMisses,
-      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
-      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
-      evaluatorRawRequest: {
-        aggregator: "weighted_average",
-        ...weights ? { weights } : {}
-      },
-      evaluatorResults
-    };
-  }
-  async runCodeAggregator(results, scriptPath, cwd, weights) {
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      weight: weights?.[member.id] ?? 1,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    try {
-      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
-      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
-      return {
-        score,
-        verdict,
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath
-        },
-        evaluatorResults
-      };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [`Code aggregator failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath,
-          error: message
-        },
-        evaluatorResults
-      };
-    }
-  }
-  async runLlmAggregator(results, context, config) {
-    const judgeProvider = context.judgeProvider;
-    if (!judgeProvider) {
-      throw new Error("No judge provider available for LLM aggregation");
-    }
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const resultsJson = JSON.stringify(resultsObject, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
-    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
-    const systemPrompt = buildOutputSchema();
-    const evaluatorRawRequest = {
-      aggregator: "llm_judge",
-      userPrompt,
-      systemPrompt,
-      target: judgeProvider.targetName
-    };
-    try {
-      const model = judgeProvider.asLanguageModel?.();
-      if (model) {
-        const { text } = await generateText2({
-          model,
-          system: systemPrompt,
-          prompt: userPrompt
-        });
-        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
-        const score2 = clampScore(data2.score);
-        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
-        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
-        const reasoning2 = data2.reasoning;
-        return {
-          score: score2,
-          verdict: scoreToVerdict(score2),
-          hits: hits2,
-          misses: misses2,
-          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
-          reasoning: reasoning2,
-          evaluatorRawRequest,
-          evaluatorResults
-        };
-      }
-      const response = await judgeProvider.invoke({
-        question: userPrompt,
-        systemPrompt,
-        evalCaseId: context.evalCase.id,
-        attempt: context.attempt
-      });
-      const data = freeformEvaluationSchema.parse(
-        parseJsonFromText(extractLastAssistantContent(response.outputMessages))
-      );
-      const score = clampScore(data.score);
-      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
-      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning;
+      };
+    }
+    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
-        score,
-        verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: Math.max(hits.length + misses.length, 1),
-        reasoning,
-        evaluatorRawRequest,
-        evaluatorResults
+        path: path15,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path15} (invalid numeric value)`
       };
-    } catch {
+    }
+    const diff = Math.abs(candidateNum - expectedNum);
+    let withinTolerance;
+    if (relative) {
+      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
+      withinTolerance = relativeDiff <= tolerance;
+    } else {
+      withinTolerance = diff <= tolerance;
+    }
+    if (withinTolerance) {
+      return {
+        path: path15,
+        score: 1,
+        weight,
+        hit: true,
+        message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
+      };
+    }
+    return {
+      path: path15,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
+    };
+  }
+  /**
+   * Date comparison with format normalization.
+   */
+  compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
+    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
+    const candidateDate = parseDate(String(candidateValue), formats);
+    const expectedDate = parseDate(String(expectedValue), formats);
+    if (candidateDate === null) {
       return {
+        path: path15,
         score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [],
-        expectedAspectCount: 1,
-        evaluatorRawRequest,
-        evaluatorResults
+        weight,
+        hit: false,
+        message: `${path15} (unparseable candidate date)`
+      };
+    }
+    if (expectedDate === null) {
+      return {
+        path: path15,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path15} (unparseable expected date)`
+      };
+    }
+    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
+      return {
+        path: path15,
+        score: 1,
+        weight,
+        hit: true,
+        message: path15
       };
     }
+    return {
+      path: path15,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+    };
+  }
+  /**
+   * Aggregate field results using configured strategy.
+   */
+  aggregateResults(results) {
+    const aggregation = this.config.aggregation ?? "weighted_average";
+    const hits = [];
+    const misses = [];
+    for (const result of results) {
+      if (result.hit) {
+        hits.push(result.message);
+      } else {
+        misses.push(result.message);
+      }
+    }
+    let score;
+    if (aggregation === "all_or_nothing") {
+      score = misses.length === 0 ? 1 : 0;
+    } else {
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
+      if (totalWeight === 0) {
+        score = results.length === 0 ? 1 : 0;
+      } else {
+        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
+        score = weightedSum / totalWeight;
+      }
+    }
+    const reasoning = `${hits.length}/${results.length} fields matched`;
+    return {
+      score: clampScore(score),
+      verdict: scoreToVerdict(score),
+      hits: hits.slice(0, 4),
+      misses: misses.slice(0, 4),
+      expectedAspectCount: results.length,
+      reasoning
+    };
   }
 };
+function resolvePath(obj, path15) {
+  if (!path15 || !obj) {
+    return void 0;
+  }
+  const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  let current = obj;
+  for (const part of parts) {
+    if (current === null || current === void 0) {
+      return void 0;
+    }
+    if (typeof current !== "object") {
+      return void 0;
+    }
+    const isIndex = /^\d+$/.test(part);
+    if (isIndex && Array.isArray(current)) {
+      current = current[Number.parseInt(part, 10)];
+    } else {
+      current = current[part];
+    }
+  }
+  return current;
+}
+function toNumber(value) {
+  if (typeof value === "number") {
+    return value;
+  }
+  if (typeof value === "string") {
+    const num = Number.parseFloat(value);
+    return Number.isNaN(num) ? null : num;
+  }
+  return null;
+}
+function parseDate(dateStr, formats) {
+  if (!dateStr) return null;
+  const trimmed = dateStr.trim();
+  const isoDate = new Date(trimmed);
+  if (!Number.isNaN(isoDate.getTime())) {
+    return isoDate;
+  }
+  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
+  if (localizedMatch) {
+    const day = Number.parseInt(localizedMatch[1], 10);
+    const monthName = localizedMatch[2].toLowerCase();
+    const year = Number.parseInt(localizedMatch[3], 10);
+    const month = MONTH_NAMES[monthName];
+    if (month !== void 0) {
+      return new Date(year, month, day);
+    }
+  }
+  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
+  if (usMatch) {
+    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
+    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
+    if (hasUSFormat && !hasEUFormat) {
+      const month = Number.parseInt(usMatch[1], 10) - 1;
+      const day = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else if (hasEUFormat && !hasUSFormat) {
+      const day = Number.parseInt(usMatch[1], 10);
+      const month = Number.parseInt(usMatch[2], 10) - 1;
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else {
+      const num1 = Number.parseInt(usMatch[1], 10);
+      const num2 = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (num1 > 12 && num2 <= 12) {
+        return new Date(year, num2 - 1, num1);
+      }
+      if (num2 > 12 && num1 <= 12) {
+        return new Date(year, num1 - 1, num2);
+      }
+      if (num1 <= 12 && num2 <= 31) {
+        return new Date(year, num1 - 1, num2);
+      }
+    }
+  }
+  return null;
+}
+function formatDateISO(date) {
+  return date.toISOString().split("T")[0];
+}
+function parseJsonFromTextSafe(text) {
+  return parseJsonFromText(text);
+}
+// src/evaluation/evaluators/latency.ts
 var LatencyEvaluator = class {
   kind = "latency";
   config;
@@ -6635,54 +6881,14 @@ var LatencyEvaluator = class {
       reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
       evaluatorRawRequest: {
         type: "latency",
-        threshold,
-        durationMs
-      }
-    };
-  }
-};
-var CostEvaluator = class {
-  kind = "cost";
-  config;
-  constructor(options) {
-    this.config = options.config;
-  }
-  evaluate(context) {
-    const { budget } = this.config;
-    const costUsd = context.traceSummary?.costUsd;
-    if (costUsd === void 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No cost data available in trace"],
-        expectedAspectCount: 1,
-        reasoning: "Execution cost not reported by provider",
-        evaluatorRawRequest: {
-          type: "cost",
-          budget,
-          costUsd: null
-        }
-      };
-    }
-    const passed = costUsd <= budget;
-    const score = passed ? 1 : 0;
-    const formatCost = (n) => `$${n.toFixed(4)}`;
-    return {
-      score,
-      verdict: passed ? "pass" : "fail",
-      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
-      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
-      expectedAspectCount: 1,
-      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
-      evaluatorRawRequest: {
-        type: "cost",
-        budget,
-        costUsd
+        threshold,
+        durationMs
       }
     };
   }
 };
+// src/evaluation/evaluators/token-usage.ts
 var TokenUsageEvaluator = class {
   kind = "token_usage";
   config;
@@ -6766,6 +6972,226 @@ var TokenUsageEvaluator = class {
   }
 };
+// src/evaluation/evaluators/tool-trajectory.ts
+function argsMatch(expected, actual) {
+  if (expected === void 0) return true;
+  if (expected === "any") return true;
+  if (actual === void 0) return false;
+  for (const key of Object.keys(expected)) {
+    if (!Object.hasOwn(actual, key)) return false;
+    if (!deepEqual(expected[key], actual[key])) return false;
+  }
+  return true;
+}
+var ToolTrajectoryEvaluator = class {
+  kind = "tool_trajectory";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  evaluate(context) {
+    const { outputMessages, traceSummary } = context;
+    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
+    if (toolCalls.length === 0 && !traceSummary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
+    if (!summary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    switch (this.config.mode) {
+      case "any_order":
+        return this.evaluateAnyOrder(summary);
+      case "in_order":
+        return this.evaluateInOrder(toolCalls);
+      case "exact":
+        return this.evaluateExact(toolCalls);
+      default:
+        return {
+          score: 0,
+          verdict: "fail",
+          hits: [],
+          misses: [`Unknown mode: ${this.config.mode}`],
+          expectedAspectCount: 1
+        };
+    }
+  }
+  /**
+   * Extract tool calls from output messages.
+   */
+  extractToolCallsFromMessages(messages) {
+    if (!messages) {
+      return [];
+    }
+    const toolCalls = [];
+    for (const message of messages) {
+      if (message.toolCalls) {
+        for (const call of message.toolCalls) {
+          toolCalls.push({
+            name: call.tool,
+            args: call.input
+          });
+        }
+      }
+    }
+    return toolCalls;
+  }
+  /**
+   * Build a summary from extracted tool calls.
+   */
+  buildSummary(toolCalls) {
+    const toolCallsByName = {};
+    for (const call of toolCalls) {
+      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
+    }
+    const toolNames = Object.keys(toolCallsByName).sort();
+    return {
+      eventCount: toolCalls.length,
+      toolNames,
+      toolCallsByName,
+      errorCount: 0
+    };
+  }
+  evaluateAnyOrder(summary) {
+    const minimums = this.config.minimums ?? {};
+    const toolNames = Object.keys(minimums);
+    if (toolNames.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool requirements specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    for (const toolName of toolNames) {
+      const required = minimums[toolName];
+      const actual = summary.toolCallsByName[toolName] ?? 0;
+      if (actual >= required) {
+        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      } else {
+        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      }
+    }
+    const score = hits.length / toolNames.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: toolNames.length
+    };
+  }
+  evaluateInOrder(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    let actualIndex = 0;
+    for (let i = 0; i < expected.length; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      let found = false;
+      let argsMismatch = false;
+      while (actualIndex < toolCalls.length) {
+        const actualCall = toolCalls[actualIndex];
+        if (actualCall.name === expectedTool) {
+          if (argsMatch(expectedItem.args, actualCall.args)) {
+            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            actualIndex++;
+            found = true;
+            break;
+          }
+          misses.push(
+            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
+          );
+          actualIndex++;
+          argsMismatch = true;
+          break;
+        }
+        actualIndex++;
+      }
+      if (!found && !argsMismatch) {
+        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
+      }
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+  evaluateExact(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    if (toolCalls.length !== expected.length) {
+      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
+    }
+    const checkLength = Math.min(expected.length, toolCalls.length);
+    for (let i = 0; i < checkLength; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      const actualCall = toolCalls[i];
+      const actualTool = actualCall.name;
+      if (actualTool === expectedTool) {
+        if (argsMatch(expectedItem.args, actualCall.args)) {
+          hits.push(`Position ${i}: ${expectedTool}`);
+        } else {
+          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+        }
+      } else {
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
 // src/evaluation/orchestrator.ts
 import { createHash } from "node:crypto";
 import path14 from "node:path";
@@ -6979,6 +7405,17 @@ async function runEvaluation(options) {
     }
     return getOrCreateProvider(resolvedJudge);
   };
+  const targetResolver = (name) => {
+    const resolved = resolveTargetByName(name);
+    if (!resolved) {
+      return void 0;
+    }
+    return getOrCreateProvider(resolved);
+  };
+  const availableTargets = [
+    target.name,
+    ...Array.from(targetDefinitions.keys())
+  ];
   const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
   const primaryProvider = getOrCreateProvider(target);
   const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -7008,7 +7445,9 @@ async function runEvaluation(options) {
         onResult,
         verbose,
         resolveJudgeProvider,
-        agentTimeoutMs
+        agentTimeoutMs,
+        targetResolver,
+        availableTargets
       });
     } catch (error) {
       if (verbose) {
@@ -7047,7 +7486,9 @@ async function runEvaluation(options) {
           cache,
           useCache,
           now,
-          judgeProvider
+          judgeProvider,
+          targetResolver,
+          availableTargets
         });
         if (onProgress) {
           await onProgress({
@@ -7114,7 +7555,9 @@ async function runBatchEvaluation(options) {
     onProgress,
     onResult,
     resolveJudgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    targetResolver,
+    availableTargets
   } = options;
   const promptInputsList = [];
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -7189,7 +7632,9 @@ async function runBatchEvaluation(options) {
         judgeProvider: await resolveJudgeProvider(target),
         agentTimeoutMs,
         outputMessages,
-        traceSummary
+        traceSummary,
+        targetResolver,
+        availableTargets
       });
       if (providerError) {
         result = { ...result, error: providerError };
@@ -7247,7 +7692,9 @@ async function runEvalCase(options) {
     cache,
     useCache,
     signal,
-    judgeProvider
+    judgeProvider,
+    targetResolver,
+    availableTargets
   } = options;
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -7321,7 +7768,9 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
     return providerError ? { ...result, error: providerError } : result;
   } catch (error) {
@@ -7341,7 +7790,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const gradeTimestamp = nowFn();
   const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -7356,7 +7807,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   const completedAt = nowFn();
   let agentProviderRequest;
@@ -7409,7 +7862,9 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
     return runEvaluatorList({
@@ -7425,7 +7880,9 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -7443,7 +7900,9 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   return { score };
 }
@@ -7461,7 +7920,9 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const scored = [];
   const evaluatorResults = [];
@@ -7499,7 +7960,8 @@ async function runEvaluatorList(options) {
           script: evaluator.script,
           cwd: evaluator.resolvedCwd ?? evaluator.cwd,
           agentTimeoutMs,
-          config: evaluator.config
+          config: evaluator.config,
+          target: evaluator.target
         });
         const score2 = await codeEvaluator.evaluate({
           evalCase,
@@ -7509,8 +7971,11 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
+          judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -7523,7 +7988,8 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluatorProviderRequest: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest,
+          details: score2.details
         });
       }
       if (evaluator.type === "composite") {
@@ -7537,7 +8003,8 @@ async function runEvaluatorList(options) {
                 script: memberConfig.script,
                 cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
                 agentTimeoutMs,
-                config: memberConfig.config
+                config: memberConfig.config,
+                target: memberConfig.target
               });
             case "composite":
               return new CompositeEvaluator({
@@ -7586,7 +8053,9 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -7782,11 +8251,11 @@ async function runEvaluatorList(options) {
     (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
     0
   );
-  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
+  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
     score: aggregateScore,
-    verdict: scoreToVerdict2(aggregateScore),
+    verdict: scoreToVerdict(aggregateScore),
     hits,
     misses,
     expectedAspectCount,
@@ -7833,18 +8302,6 @@ async function resolveCustomPrompt(config) {
   }
   return config.prompt;
 }
-function isNonEmptyString2(value) {
-  return typeof value === "string" && value.trim().length > 0;
-}
-function scoreToVerdict2(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
-}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
@@ -7987,7 +8444,8 @@ function mapChildResults(children) {
     misses: child.misses,
     reasoning: child.reasoning,
     evaluatorProviderRequest: child.evaluatorRawRequest,
-    evaluatorResults: mapChildResults(child.evaluatorResults)
+    evaluatorResults: mapChildResults(child.evaluatorResults),
+    details: child.details
   }));
 }
 function computeWeightedMean(entries) {
@@ -8002,7 +8460,7 @@ function computeWeightedMean(entries) {
 }
 // src/evaluation/generators/rubric-generator.ts
-import { generateText as generateText3 } from "ai";
+import { generateText as generateText4 } from "ai";
 import { z as z3 } from "zod";
 var rubricItemSchema = z3.object({
   id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -8036,7 +8494,7 @@ You must return a valid JSON object matching this schema:
   let lastError;
   for (let attempt = 1; attempt <= 3; attempt++) {
     try {
-      const { text } = await generateText3({
+      const { text } = await generateText4({
         model,
         system,
         prompt
@@ -8081,17 +8539,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
   return parts.join("\n");
 }
-// src/evaluation/code-judge-sdk.ts
-import { readFileSync } from "node:fs";
-function parseCodeJudgePayload(payload) {
-  const parsed = JSON.parse(payload);
-  return toCamelCaseDeep(parsed);
-}
-function readCodeJudgePayload() {
-  const stdin = readFileSync(0, "utf8");
-  return parseCodeJudgePayload(stdin);
-}
 // src/index.ts
 function createAgentKernel() {
   return { status: "stub" };
@@ -8109,33 +8556,39 @@ export {
   ToolTrajectoryEvaluator,
   avgToolDurationMs,
   buildDirectoryChain,
+  buildOutputSchema,
   buildPromptInputs,
   buildSearchRoots,
+  clampScore,
   computeTraceSummary,
   consumeClaudeCodeLogEntries,
   consumeCodexLogEntries,
   consumePiLogEntries,
   createAgentKernel,
   createProvider,
+  deepEqual,
   ensureVSCodeSubagents,
+  executeScript,
   explorationRatio,
-  extractCodeBlocks,
+  extractJsonBlob,
   fileExists,
   findGitRoot,
+  freeformEvaluationSchema,
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,
+  isNonEmptyString,
   isTestMessage,
   isTestMessageRole,
   listTargetNames,
   loadEvalCases,
   mergeExecutionMetrics,
   normalizeLineEndings,
-  parseCodeJudgePayload,
-  readCodeJudgePayload,
+  parseJsonFromText,
+  parseJsonSafe,
   readJsonFile,
   readTargetDefinitions,
   readTestSuiteMetadata,
@@ -8145,6 +8598,7 @@ export {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
+  scoreToVerdict,
   subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries,
   subscribeToPiLogEntries,