npm - @agentv/core - Versions diffs - 2.0.1 → 2.1.0 - Mend

@agentv/core 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/{chunk-IBTKEEOT.js → chunk-KDEP4I7G.js} +44 -1
package/dist/chunk-KDEP4I7G.js.map +1 -0
package/dist/evaluation/validation/index.cjs +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +1641 -1138
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +157 -100
package/dist/index.d.ts +157 -100
package/dist/index.js +1451 -997
package/dist/index.js.map +1 -1
package/package.json +4 -1
package/dist/chunk-IBTKEEOT.js.map +0 -1

package/dist/index.cjs CHANGED Viewed

@@ -42,33 +42,39 @@ __export(index_exports, {
   ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
   avgToolDurationMs: () => avgToolDurationMs,
   buildDirectoryChain: () => buildDirectoryChain2,
+  buildOutputSchema: () => buildOutputSchema,
   buildPromptInputs: () => buildPromptInputs,
   buildSearchRoots: () => buildSearchRoots2,
+  clampScore: () => clampScore,
   computeTraceSummary: () => computeTraceSummary,
   consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
   consumeCodexLogEntries: () => consumeCodexLogEntries,
   consumePiLogEntries: () => consumePiLogEntries,
   createAgentKernel: () => createAgentKernel,
   createProvider: () => createProvider,
+  deepEqual: () => deepEqual,
   ensureVSCodeSubagents: () => ensureVSCodeSubagents,
+  executeScript: () => executeScript,
   explorationRatio: () => explorationRatio,
-  extractCodeBlocks: () => extractCodeBlocks,
+  extractJsonBlob: () => extractJsonBlob,
   fileExists: () => fileExists2,
   findGitRoot: () => findGitRoot,
+  freeformEvaluationSchema: () => freeformEvaluationSchema,
   generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
   isGuidelineFile: () => isGuidelineFile,
   isJsonObject: () => isJsonObject,
   isJsonValue: () => isJsonValue,
+  isNonEmptyString: () => isNonEmptyString,
   isTestMessage: () => isTestMessage,
   isTestMessageRole: () => isTestMessageRole,
   listTargetNames: () => listTargetNames,
   loadEvalCases: () => loadEvalCases,
   mergeExecutionMetrics: () => mergeExecutionMetrics,
   normalizeLineEndings: () => normalizeLineEndings,
-  parseCodeJudgePayload: () => parseCodeJudgePayload,
-  readCodeJudgePayload: () => readCodeJudgePayload,
+  parseJsonFromText: () => parseJsonFromText,
+  parseJsonSafe: () => parseJsonSafe,
   readJsonFile: () => readJsonFile,
   readTargetDefinitions: () => readTargetDefinitions,
   readTestSuiteMetadata: () => readTestSuiteMetadata,
@@ -78,6 +84,7 @@ __export(index_exports, {
   resolveTargetDefinition: () => resolveTargetDefinition,
   runEvalCase: () => runEvalCase,
   runEvaluation: () => runEvaluation,
+  scoreToVerdict: () => scoreToVerdict,
   subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
   subscribeToPiLogEntries: () => subscribeToPiLogEntries,
@@ -223,85 +230,6 @@ var import_promises6 = require("fs/promises");
 var import_node_path6 = __toESM(require("path"), 1);
 var import_yaml2 = require("yaml");
-// src/evaluation/formatting/segment-formatter.ts
-function extractCodeBlocks(segments) {
-  const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
-  const codeBlocks = [];
-  for (const segment of segments) {
-    const typeValue = segment.type;
-    if (typeof typeValue !== "string" || typeValue !== "text") {
-      continue;
-    }
-    const textValue = segment.value;
-    if (typeof textValue !== "string") {
-      continue;
-    }
-    const matches = textValue.match(CODE_BLOCK_PATTERN);
-    if (matches) {
-      codeBlocks.push(...matches);
-    }
-  }
-  return codeBlocks;
-}
-function formatFileContents(parts) {
-  const fileCount = parts.filter((p) => p.isFile).length;
-  if (fileCount > 0) {
-    return parts.map((part) => {
-      if (part.isFile && part.displayPath) {
-        return `<file path="${part.displayPath}">
-${part.content}
-</file>`;
-      }
-      return part.content;
-    }).join("\n\n");
-  }
-  return parts.map((p) => p.content).join(" ");
-}
-function formatSegment(segment, mode = "lm") {
-  const type = asString(segment.type);
-  if (type === "text") {
-    return asString(segment.value);
-  }
-  if (type === "guideline_ref") {
-    const refPath = asString(segment.path);
-    return refPath ? `<Attached: ${refPath}>` : void 0;
-  }
-  if (type === "file") {
-    const filePath = asString(segment.path);
-    if (!filePath) {
-      return void 0;
-    }
-    if (mode === "agent") {
-      return `<file: path="${filePath}">`;
-    }
-    const text = asString(segment.text);
-    if (text && filePath) {
-      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
-    }
-  }
-  return void 0;
-}
-function hasVisibleContent(segments) {
-  return segments.some((segment) => {
-    const type = asString(segment.type);
-    if (type === "text") {
-      const value = asString(segment.value);
-      return value !== void 0 && value.trim().length > 0;
-    }
-    if (type === "guideline_ref") {
-      return false;
-    }
-    if (type === "file") {
-      const text = asString(segment.text);
-      return text !== void 0 && text.trim().length > 0;
-    }
-    return false;
-  });
-}
-function asString(value) {
-  return typeof value === "string" ? value : void 0;
-}
 // src/evaluation/loaders/config-loader.ts
 var import_promises2 = require("fs/promises");
 var import_node_path2 = __toESM(require("path"), 1);
@@ -556,7 +484,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
       continue;
     }
-    const name = asString2(rawEvaluator.name);
+    const name = asString(rawEvaluator.name);
     const typeValue = rawEvaluator.type;
     if (!name || !isEvaluatorKind(typeValue)) {
       logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -584,7 +512,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         continue;
       }
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      const cwd = asString2(rawEvaluator.cwd);
+      const cwd = asString(rawEvaluator.cwd);
       let resolvedCwd;
       if (cwd) {
         const resolved = await resolveFileReference(cwd, searchRoots);
@@ -599,7 +527,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       } else {
         resolvedCwd = searchRoots[0];
       }
-      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
+      const rawTarget = rawEvaluator.target;
+      let targetConfig;
+      if (rawTarget !== void 0) {
+        if (isJsonObject2(rawTarget)) {
+          const maxCalls = rawTarget.max_calls;
+          if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
+            logWarning2(
+              `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
+            );
+          } else {
+            targetConfig = {
+              ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
+            };
+          }
+        } else if (rawTarget === true) {
+          targetConfig = {};
+        } else {
+          logWarning2(
+            `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
+          );
+        }
+      }
+      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
       const config = {};
       for (const [key, value] of Object.entries(rawEvaluator)) {
         if (!knownProps.has(key) && value !== void 0) {
@@ -613,7 +563,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         cwd,
         resolvedCwd,
         ...weight2 !== void 0 ? { weight: weight2 } : {},
-        ...Object.keys(config).length > 0 ? { config } : {}
+        ...Object.keys(config).length > 0 ? { config } : {},
+        ...targetConfig !== void 0 ? { target: targetConfig } : {}
       });
       continue;
     }
@@ -630,7 +581,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
         continue;
       }
-      const aggregatorType = asString2(rawAggregator.type);
+      const aggregatorType = asString(rawAggregator.type);
       if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
         logWarning2(
           `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -643,7 +594,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
           continue;
         }
-        const memberName = asString2(rawMember.name);
+        const memberName = asString(rawMember.name);
         const memberType = rawMember.type;
         if (!memberName || !isEvaluatorKind(memberType)) {
           logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -681,7 +632,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
         };
       } else if (aggregatorType === "code_judge") {
-        const aggregatorPath = asString2(rawAggregator.path);
+        const aggregatorPath = asString(rawAggregator.path);
         if (!aggregatorPath) {
           logWarning2(
             `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -694,7 +645,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           cwd: searchRoots[0]
         };
       } else {
-        const aggregatorPrompt = asString2(rawAggregator.prompt);
+        const aggregatorPrompt = asString(rawAggregator.prompt);
         let promptPath2;
         if (aggregatorPrompt) {
           const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
@@ -719,7 +670,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     if (typeValue === "tool_trajectory") {
-      const mode = asString2(rawEvaluator.mode);
+      const mode = asString(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
         logWarning2(
           `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -810,8 +761,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
           );
           continue;
         }
-        const fieldPath = asString2(rawField.path);
-        const match = asString2(rawField.match);
+        const fieldPath = asString(rawField.path);
+        const match = asString(rawField.match);
         if (!fieldPath) {
           logWarning2(
             `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -841,7 +792,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
         continue;
       }
-      const aggregation = asString2(rawEvaluator.aggregation);
+      const aggregation = asString(rawEvaluator.aggregation);
       const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
       const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
       evaluators.push({
@@ -922,7 +873,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    const prompt = asString2(rawEvaluator.prompt);
+    const prompt = asString(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
       const resolved = await resolveFileReference(prompt, searchRoots);
@@ -941,11 +892,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         );
       }
     }
-    const _model = asString2(rawEvaluator.model);
+    const _model = asString(rawEvaluator.model);
     const rawRubrics = rawEvaluator.rubrics;
     const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-      id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-      description: asString2(rubric.description) ?? "",
+      id: asString(rubric.id) ?? `rubric-${index + 1}`,
+      description: asString(rubric.description) ?? "",
       weight: typeof rubric.weight === "number" ? rubric.weight : 1,
       required: typeof rubric.required === "boolean" ? rubric.required : true
     })).filter((r) => r.description.length > 0) : void 0;
@@ -989,7 +940,7 @@ function coerceEvaluator(candidate, contextId) {
   logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
   return void 0;
 }
-function asString2(value) {
+function asString(value) {
   return typeof value === "string" ? value : void 0;
 }
 function asStringArray(value, description) {
@@ -1065,6 +1016,68 @@ function isValidFieldAggregationType(value) {
 // src/evaluation/loaders/message-processor.ts
 var import_promises4 = require("fs/promises");
 var import_node_path4 = __toESM(require("path"), 1);
+// src/evaluation/formatting/segment-formatter.ts
+/**
+ * Joins content parts into one string.
+ *
+ * When at least one part is flagged `isFile`, the parts are separated by a
+ * blank line and each file part that also carries a `displayPath` is wrapped
+ * in a `<file path="...">` element. Without any file part, the contents are
+ * simply joined with a single space.
+ *
+ * @param parts Entries exposing `content` and optionally `isFile` / `displayPath`.
+ * @returns The combined text.
+ */
+function formatFileContents(parts) {
+  const containsFile = parts.filter((entry) => entry.isFile).length > 0;
+  if (!containsFile) {
+    return parts.map((entry) => entry.content).join(" ");
+  }
+  const renderPart = (entry) => {
+    if (!(entry.isFile && entry.displayPath)) {
+      return entry.content;
+    }
+    return `<file path="${entry.displayPath}">\n${entry.content}\n</file>`;
+  };
+  return parts.map((entry) => renderPart(entry)).join("\n\n");
+}
+/**
+ * Renders one message segment as text, dispatching on its `type`.
+ *
+ * - `text`: the segment's string `value`.
+ * - `guideline_ref`: an `<Attached: path>` marker when `path` is a non-empty string.
+ * - `file`: in "agent" mode a `<file: path="...">` reference; otherwise the
+ *   trimmed file text wrapped through formatFileContents.
+ *
+ * @param segment Segment object; non-string fields are treated as absent.
+ * @param mode Rendering mode, "lm" by default.
+ * @returns The rendered text, or undefined when nothing can be rendered.
+ */
+function formatSegment(segment, mode = "lm") {
+  switch (asString2(segment.type)) {
+    case "text":
+      return asString2(segment.value);
+    case "guideline_ref": {
+      const attachedPath = asString2(segment.path);
+      return attachedPath ? `<Attached: ${attachedPath}>` : void 0;
+    }
+    case "file": {
+      const sourcePath = asString2(segment.path);
+      if (!sourcePath) {
+        return void 0;
+      }
+      if (mode === "agent") {
+        return `<file: path="${sourcePath}">`;
+      }
+      const body = asString2(segment.text);
+      if (!body) {
+        // Missing or empty file text renders nothing.
+        return void 0;
+      }
+      return formatFileContents([{ content: body.trim(), isFile: true, displayPath: sourcePath }]);
+    }
+    default:
+      return void 0;
+  }
+}
+/**
+ * Reports whether any segment carries non-whitespace text.
+ *
+ * Only `text` segments (via `value`) and `file` segments (via `text`) can
+ * count; `guideline_ref` and every other type never do.
+ *
+ * @param segments Array of segment objects.
+ * @returns true when at least one segment has visible content.
+ */
+function hasVisibleContent(segments) {
+  const isNonBlank = (candidate) => {
+    const str = asString2(candidate);
+    return str !== void 0 && str.trim().length > 0;
+  };
+  return segments.some((segment) => {
+    switch (asString2(segment.type)) {
+      case "text":
+        return isNonBlank(segment.value);
+      case "file":
+        return isNonBlank(segment.text);
+      default:
+        return false;
+    }
+  });
+}
+/**
+ * Narrows an arbitrary value to a string.
+ *
+ * @param value Any value.
+ * @returns The value itself when it is a string, otherwise undefined.
+ */
+function asString2(value) {
+  if (typeof value !== "string") {
+    return void 0;
+  }
+  return value;
+}
+// src/evaluation/loaders/message-processor.ts
 var ANSI_YELLOW4 = "\x1B[33m";
 var ANSI_RESET4 = "\x1B[0m";
 async function processMessages(options) {
@@ -1370,9 +1383,6 @@ ${messageContent}`);
         questionParts.push(formattedContent);
       }
     }
-    if (testCase.code_snippets.length > 0) {
-      questionParts.push(testCase.code_snippets.join("\n"));
-    }
     question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
   }
   const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1571,7 +1581,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       repoRootPath,
       verbose
     }) : [];
-    const codeSnippets = extractCodeBlocks(inputSegments);
     let referenceAnswer = "";
     if (outputSegments.length > 0) {
       const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1644,7 +1653,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
       file_paths: allFilePaths,
-      code_snippets: codeSnippets,
       expected_outcome: outcome,
       evaluator: evalCaseEvaluatorKind,
       evaluators
@@ -4272,6 +4280,167 @@ var MockProvider = class {
   }
 };
+// src/evaluation/providers/pi-agent-sdk.ts
+// Module-level cache for the dynamically imported pi packages; both stay null
+// until the first successful load.
+var piAgentModule = null;
+var piAiModule = null;
+/**
+ * Dynamically imports `@mariozechner/pi-agent` and `@mariozechner/pi-ai` on
+ * first use and caches both modules for later calls.
+ *
+ * @returns An object exposing `Agent`, `ProviderTransport`, `getModel` and
+ *   `getEnvApiKey` taken from the loaded modules.
+ * @throws {Error} When either import fails. The message contains install
+ *   instructions and the original message; the original error is attached as
+ *   `cause` so its stack is not lost.
+ */
+async function loadPiModules() {
+  if (!piAgentModule || !piAiModule) {
+    try {
+      // Assigned together so a failed import never leaves one cache slot set.
+      [piAgentModule, piAiModule] = await Promise.all([
+        import("@mariozechner/pi-agent"),
+        import("@mariozechner/pi-ai")
+      ]);
+    } catch (error) {
+      throw new Error(
+        `Failed to load pi-agent-sdk dependencies. Please install them:
+  npm install @mariozechner/pi-agent @mariozechner/pi-ai
+Original error: ${error instanceof Error ? error.message : String(error)}`,
+        { cause: error }
+      );
+    }
+  }
+  return {
+    Agent: piAgentModule.Agent,
+    ProviderTransport: piAgentModule.ProviderTransport,
+    getModel: piAiModule.getModel,
+    getEnvApiKey: piAiModule.getEnvApiKey
+  };
+}
+/**
+ * Provider that answers a single question through the pi agent SDK, using an
+ * `Agent` configured without tools.
+ *
+ * Config fields read here: `provider` (default "anthropic"), `model`
+ * (default "claude-sonnet-4-20250514"), `systemPrompt`, `apiKey` and
+ * `timeoutMs` (default 120000).
+ */
+var PiAgentSdkProvider = class {
+  id;
+  kind = "pi-agent-sdk";
+  targetName;
+  supportsBatch = false;
+  config;
+  constructor(targetName, config) {
+    this.id = `pi-agent-sdk:${targetName}`;
+    this.targetName = targetName;
+    this.config = config;
+  }
+  /**
+   * Sends `request.question` to a fresh agent and collects the conversation.
+   *
+   * @param request Object with `question` and an optional `signal`; the signal
+   *   is only consulted once, before any work starts.
+   * @returns `{ raw, outputMessages, durationMs }` where `outputMessages` are
+   *   the agent's state messages converted with convertAgentMessage.
+   * @throws {Error} When the signal is already aborted, when the SDK packages
+   *   cannot be loaded, or when the prompt does not finish within the timeout.
+   */
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("Pi agent SDK request was aborted before execution");
+    }
+    const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
+    const startTime = Date.now();
+    const providerName = this.config.provider ?? "anthropic";
+    const modelId = this.config.model ?? "claude-sonnet-4-20250514";
+    const model = getModel(providerName, modelId);
+    const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
+    const transport = new ProviderTransport({
+      // An explicitly configured key wins over the environment lookup.
+      getApiKey: async (provider) => {
+        return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
+      }
+    });
+    const agent = new Agent({
+      initialState: {
+        systemPrompt,
+        model,
+        tools: [],
+        // No tools for simple Q&A
+        messages: []
+      },
+      transport
+    });
+    const outputMessages = [];
+    // NOTE(review): this value is captured but never read in this method;
+    // kept so the subscription behaviour is unchanged.
+    let finalAssistantContent = "";
+    const unsubscribe = agent.subscribe((event) => {
+      if (event.type === "message_end") {
+        const msg = event.message;
+        if (msg.role === "assistant") {
+          const content = extractTextContent2(msg.content);
+          if (content) {
+            finalAssistantContent = content;
+          }
+        }
+      }
+    });
+    let timeoutHandle;
+    try {
+      const timeoutMs = this.config.timeoutMs ?? 12e4;
+      const timeoutPromise = new Promise((_, reject) => {
+        timeoutHandle = setTimeout(
+          () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
+          timeoutMs
+        );
+      });
+      await Promise.race([agent.prompt(request.question), timeoutPromise]);
+      await agent.waitForIdle();
+      const agentMessages = agent.state.messages;
+      for (const msg of agentMessages) {
+        outputMessages.push(convertAgentMessage(msg));
+      }
+      const durationMs = Date.now() - startTime;
+      return {
+        raw: {
+          messages: agentMessages,
+          systemPrompt,
+          // Reports the configured values, which may be undefined when the
+          // defaults above were used.
+          model: this.config.model,
+          provider: this.config.provider
+        },
+        outputMessages,
+        durationMs
+      };
+    } finally {
+      // Without this the pending timer would outlive a successful call and
+      // keep the event loop alive until it fires.
+      clearTimeout(timeoutHandle);
+      unsubscribe();
+    }
+  }
+};
+/**
+ * Pulls plain text out of a message `content` value.
+ *
+ * @param content A string, or an array of content parts.
+ * @returns The string itself; for an array, the `text` of every
+ *   `{ type: "text", text: string }` part joined by newlines; undefined when
+ *   the input is neither or no text part is present.
+ */
+function extractTextContent2(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const isTextPart = (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
+  const pieces = content.filter((part) => isTextPart(part)).map((part) => part.text);
+  if (pieces.length === 0) {
+    return void 0;
+  }
+  return pieces.join("\n");
+}
+/**
+ * Converts one agent state message into the provider output-message shape.
+ *
+ * @param message Raw message; anything that is not an object becomes an
+ *   "unknown"-role message whose content is its string form.
+ * @returns `{ role, content, toolCalls, timestamp }`. `toolCalls` is undefined
+ *   when the message has none; a numeric `timestamp` is converted to an ISO
+ *   string, a string one is passed through, anything else yields undefined.
+ */
+function convertAgentMessage(message) {
+  if (!message || typeof message !== "object") {
+    return { role: "unknown", content: String(message) };
+  }
+  const { role: rawRole, content: rawContent, timestamp: rawTimestamp } = message;
+  const text = extractTextContent2(rawContent);
+  const calls = extractToolCalls2(rawContent);
+  let timestamp;
+  if (typeof rawTimestamp === "number") {
+    timestamp = new Date(rawTimestamp).toISOString();
+  } else if (typeof rawTimestamp === "string") {
+    timestamp = rawTimestamp;
+  }
+  return {
+    role: typeof rawRole === "string" ? rawRole : "unknown",
+    content: text,
+    toolCalls: calls.length > 0 ? calls : void 0,
+    timestamp
+  };
+}
+/**
+ * Collects tool invocations from a message `content` array.
+ *
+ * @param content Array of content parts; any other value yields no calls.
+ * @returns One `{ tool, input, id }` entry per `tool_use` part with a string
+ *   `name`; `id` is undefined unless the part's `id` is a string.
+ */
+function extractToolCalls2(content) {
+  if (!Array.isArray(content)) {
+    return [];
+  }
+  const isToolUse = (part) => Boolean(part) && typeof part === "object" && part.type === "tool_use" && typeof part.name === "string";
+  return content.filter((part) => isToolUse(part)).map((part) => ({
+    tool: part.name,
+    input: part.input,
+    id: typeof part.id === "string" ? part.id : void 0
+  }));
+}
 // src/evaluation/providers/pi-coding-agent.ts
 var import_node_child_process4 = require("child_process");
 var import_node_crypto3 = require("crypto");
@@ -4787,8 +4956,8 @@ function convertPiMessage(message) {
   if (typeof role !== "string") {
     return void 0;
   }
-  const content = extractTextContent2(msg.content);
-  const toolCalls = extractToolCalls2(msg.content);
+  const content = extractTextContent3(msg.content);
+  const toolCalls = extractToolCalls3(msg.content);
   const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
   const metadata = {};
   if (msg.api) metadata.api = msg.api;
@@ -4804,7 +4973,7 @@ function convertPiMessage(message) {
     metadata: Object.keys(metadata).length > 0 ? metadata : void 0
   };
 }
-function extractTextContent2(content) {
+function extractTextContent3(content) {
   if (typeof content === "string") {
     return content;
   }
@@ -4823,7 +4992,7 @@ function extractTextContent2(content) {
   }
   return textParts.length > 0 ? textParts.join("\n") : void 0;
 }
-function extractToolCalls2(content) {
+function extractToolCalls3(content) {
   if (!Array.isArray(content)) {
     return [];
   }
@@ -5227,6 +5396,15 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
         providerBatching,
         config: resolvePiCodingAgentConfig(parsed, env)
       };
+    case "pi-agent-sdk":
+      return {
+        kind: "pi-agent-sdk",
+        name: parsed.name,
+        judgeTarget: parsed.judge_target,
+        workers: parsed.workers,
+        providerBatching,
+        config: resolvePiAgentSdkConfig(parsed, env)
+      };
     case "claude-code":
       return {
         kind: "claude-code",
@@ -5448,25 +5626,58 @@ function resolvePiCodingAgentConfig(target, env) {
     systemPrompt
   };
 }
-function resolveClaudeCodeConfig(target, env) {
-  const executableSource = target.executable ?? target.command ?? target.binary;
-  const modelSource = target.model;
-  const argsSource = target.args ?? target.arguments;
-  const cwdSource = target.cwd;
+function resolvePiAgentSdkConfig(target, env) {
+  const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
+  const modelSource = target.model ?? target.pi_model ?? target.piModel;
+  const apiKeySource = target.api_key ?? target.apiKey;
   const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
-  const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
-  const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
   const systemPromptSource = target.system_prompt ?? target.systemPrompt;
-  const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
+  const provider = resolveOptionalString(
+    providerSource,
+    env,
+    `${target.name} pi-agent-sdk provider`,
+    {
+      allowLiteral: true,
+      optionalEnv: true
+    }
+  );
+  const model = resolveOptionalString(modelSource, env, `${target.name} pi-agent-sdk model`, {
     allowLiteral: true,
     optionalEnv: true
-  }) ?? "claude";
-  const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
-    allowLiteral: true,
+  });
+  const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi-agent-sdk api key`, {
+    allowLiteral: false,
     optionalEnv: true
   });
-  const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
-  const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
+  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
+  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
+  return {
+    provider,
+    model,
+    apiKey,
+    timeoutMs,
+    systemPrompt
+  };
+}
+function resolveClaudeCodeConfig(target, env) {
+  const executableSource = target.executable ?? target.command ?? target.binary;
+  const modelSource = target.model;
+  const argsSource = target.args ?? target.arguments;
+  const cwdSource = target.cwd;
+  const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
+  const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
+  const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
+  const systemPromptSource = target.system_prompt ?? target.systemPrompt;
+  const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
+    allowLiteral: true,
+    optionalEnv: true
+  }) ?? "claude";
+  const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
+  const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
     allowLiteral: true,
     optionalEnv: true
   });
@@ -6106,6 +6317,8 @@ function createProvider(target) {
       return new CodexProvider(target.name, target.config);
     case "pi-coding-agent":
       return new PiCodingAgentProvider(target.name, target.config);
+    case "pi-agent-sdk":
+      return new PiAgentSdkProvider(target.name, target.config);
     case "claude-code":
       return new ClaudeCodeProvider(target.name, target.config);
     case "mock":
@@ -6124,9 +6337,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
-// src/evaluation/evaluators.ts
-var import_ai2 = require("ai");
-var import_zod3 = require("zod");
+// src/evaluation/evaluators/scoring.ts
+/**
+ * Maps a numeric score onto a verdict label.
+ *
+ * @param score Score to classify.
+ * @returns "pass" for scores of at least 0.8, "borderline" for scores of at
+ *   least 0.6, and "fail" for everything else (including NaN).
+ */
+function scoreToVerdict(score) {
+  const isPass = score >= 0.8;
+  const isBorderline = score >= 0.6;
+  return isPass ? "pass" : isBorderline ? "borderline" : "fail";
+}
+/**
+ * Restricts a score to the closed interval [0, 1].
+ *
+ * @param value Candidate score.
+ * @returns 0 for NaN, infinite or non-number input and for negative values,
+ *   1 for values above 1, otherwise the value unchanged.
+ */
+function clampScore(value) {
+  // Number.isFinite is false for NaN, both infinities and any non-number.
+  if (!Number.isFinite(value)) {
+    return 0;
+  }
+  return value < 0 ? 0 : value > 1 ? 1 : value;
+}
+/**
+ * Finds the brace-delimited portion of a text.
+ *
+ * The match is greedy: it runs from the first `{` to the last `}` in the
+ * text, so several separate objects are returned as one span.
+ *
+ * @param text Text to search.
+ * @returns The matched span, or undefined when no `{...}` pair exists.
+ */
+function extractJsonBlob(text) {
+  const found = text.match(/\{[\s\S]*\}/);
+  if (found === null) {
+    return void 0;
+  }
+  return found[0];
+}
+/**
+ * Parses JSON embedded in free-form text such as a fenced model reply.
+ *
+ * Fence markers ("```json" and "```") are stripped, then the brace-delimited
+ * span found by extractJsonBlob is parsed; when there is none, the whole
+ * cleaned text is parsed instead.
+ *
+ * @param text Raw text; non-string input is treated as an empty string.
+ * @returns The parsed JSON value.
+ * @throws {SyntaxError} From JSON.parse when the candidate is not valid JSON.
+ */
+function parseJsonFromText(text) {
+  let cleaned = "";
+  if (typeof text === "string") {
+    cleaned = text.replace(/```json\n?|```/g, "").trim();
+  }
+  const candidate = extractJsonBlob(cleaned);
+  return JSON.parse(candidate ?? cleaned);
+}
+/**
+ * Checks for a string that contains something other than whitespace.
+ *
+ * @param value Any value.
+ * @returns true only for strings whose trimmed length is greater than zero.
+ */
+function isNonEmptyString(value) {
+  if (typeof value !== "string") {
+    return false;
+  }
+  return value.trim().length > 0;
+}
+/**
+ * Parses JSON without throwing.
+ *
+ * @param payload JSON text.
+ * @returns The parsed value, or undefined when JSON.parse rejects the payload.
+ */
+function parseJsonSafe(payload) {
+  let parsed;
+  try {
+    parsed = JSON.parse(payload);
+  } catch {
+    // Invalid input is deliberately reported as undefined.
+    parsed = void 0;
+  }
+  return parsed;
+}
+/**
+ * Structural equality for plain values, arrays and objects.
+ *
+ * Primitives are compared with `===`. Arrays must have the same length and
+ * pairwise-equal elements. Objects must have the same number of own
+ * enumerable keys, each present on the other side with an equal value.
+ *
+ * @param a First value.
+ * @param b Second value.
+ * @returns true when both values are structurally equal.
+ */
+function deepEqual(a, b) {
+  if (a === b) {
+    return true;
+  }
+  // Past this point the values are not identical, so null, primitives and
+  // functions can only be unequal.
+  if (a === null || b === null) {
+    return false;
+  }
+  if (typeof a !== "object" || typeof b !== "object") {
+    return false;
+  }
+  const leftIsArray = Array.isArray(a);
+  if (leftIsArray !== Array.isArray(b)) {
+    return false;
+  }
+  if (leftIsArray) {
+    if (a.length !== b.length) {
+      return false;
+    }
+    return !a.some((item, index) => !deepEqual(item, b[index]));
+  }
+  const leftKeys = Object.keys(a);
+  if (leftKeys.length !== Object.keys(b).length) {
+    return false;
+  }
+  for (const key of leftKeys) {
+    if (!Object.hasOwn(b, key) || !deepEqual(a[key], b[key])) {
+      return false;
+    }
+  }
+  return true;
+}
 // src/runtime/exec.ts
 function shellEscapePath(value) {
@@ -6151,7 +6419,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
     cwd: options.cwd,
     stdin: encoder.encode(stdinPayload),
     stdout: "pipe",
-    stderr: "pipe"
+    stderr: "pipe",
+    // Merge additional env vars with process.env
+    env: options.env ? { ...process.env, ...options.env } : process.env
   });
   let timedOut = false;
   const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -6186,7 +6456,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
     const [cmd, ...args] = argv;
     const child = spawn4(cmd, args, {
       cwd: options.cwd,
-      stdio: ["pipe", "pipe", "pipe"]
+      stdio: ["pipe", "pipe", "pipe"],
+      // Merge additional env vars with process.env
+      env: options.env ? { ...process.env, ...options.env } : process.env
     });
     const stdoutChunks = [];
     const stderrChunks = [];
@@ -6239,7 +6511,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
       const child = spawn4(wrappedCommand, {
         shell: true,
         cwd: options.cwd,
-        stdio: ["ignore", "ignore", "ignore"]
+        stdio: ["ignore", "ignore", "ignore"],
+        // Merge additional env vars with process.env
+        env: options.env ? { ...process.env, ...options.env } : process.env
       });
       const timeout = options.timeoutMs ? setTimeout(() => {
         child.kill();
@@ -6266,6 +6540,221 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
   }
 }
+// src/runtime/target-proxy.ts
+var import_node_crypto4 = require("crypto");
+var import_node_http = require("http");
+// Call budget for a target proxy when the evaluator's target config does not set `max_calls`.
+var DEFAULT_MAX_CALLS = 50;
+/**
+ * Starts a loopback-only HTTP proxy through which an evaluator script can invoke providers.
+ *
+ * The server binds to 127.0.0.1 on an OS-assigned port. Every non-OPTIONS request must carry
+ * `Authorization: Bearer <token>`, where the token is 32 random bytes (hex) generated per proxy.
+ * Routes: `GET /info`, `POST /invoke`, `POST /invokeBatch`.
+ *
+ * @param {object} options
+ * @param {object} options.defaultProvider Provider used when a request names no target or names
+ *   `defaultProvider.targetName`.
+ * @param {Function} [options.targetResolver] Maps any other target name to a provider (or undefined).
+ * @param {string[]} [options.availableTargets] Names reported by `/info` and in "unknown target"
+ *   errors; defaults to just the default provider's name.
+ * @param {number} options.maxCalls Maximum number of provider invocations this proxy will perform.
+ * @returns {Promise<object>} `{ url, token, shutdown, getUsageMetadata }`, resolved once listening.
+ */
+async function createTargetProxy(options) {
+  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
+  const token = (0, import_node_crypto4.randomBytes)(32).toString("hex");
+  const expectedAuth = Buffer.from(`Bearer ${token}`);
+  let callCount = 0;
+  let isShutdown = false;
+  const targetsList = availableTargets ?? [defaultProvider.targetName];
+  function resolveProvider(targetName) {
+    if (targetName === void 0 || targetName === defaultProvider.targetName) {
+      return defaultProvider;
+    }
+    if (targetResolver) {
+      return targetResolver(targetName);
+    }
+    return void 0;
+  }
+  // Constant-time comparison so the token cannot be probed through response timing.
+  function isAuthorized(authHeader) {
+    if (typeof authHeader !== "string") {
+      return false;
+    }
+    const presented = Buffer.from(authHeader);
+    return presented.length === expectedAuth.length && (0, import_node_crypto4.timingSafeEqual)(presented, expectedAuth);
+  }
+  // Checks the budget and reserves one call in a single synchronous step, so requests that
+  // were awaiting their bodies concurrently cannot collectively exceed `maxCalls`.
+  function tryReserveCall() {
+    if (callCount >= maxCalls) {
+      return false;
+    }
+    callCount++;
+    return true;
+  }
+  // Reads and parses the request body. On malformed JSON it answers 400 and returns undefined.
+  async function readJsonBody(req, res) {
+    const raw = await readBody(req);
+    try {
+      return { value: JSON.parse(raw) };
+    } catch {
+      sendJson(res, 400, { error: "Invalid JSON body" });
+      return void 0;
+    }
+  }
+  // True when the parsed request carries a non-empty string `question`.
+  function hasQuestion(request) {
+    return typeof request?.question === "string" && request.question.length > 0;
+  }
+  // Invokes the provider for one request and shapes the proxy response.
+  async function invokeProvider(provider, request) {
+    const response = await provider.invoke({
+      question: request.question,
+      systemPrompt: request.systemPrompt,
+      evalCaseId: request.evalCaseId ?? "proxy",
+      attempt: request.attempt ?? 1
+    });
+    const outputMessages = response.outputMessages ?? [];
+    return {
+      outputMessages,
+      rawText: extractLastAssistantContent(outputMessages)
+    };
+  }
+  const server = (0, import_node_http.createServer)(async (req, res) => {
+    res.setHeader("Access-Control-Allow-Origin", "*");
+    res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
+    res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
+    // CORS preflight is answered before authentication.
+    if (req.method === "OPTIONS") {
+      res.writeHead(204);
+      res.end();
+      return;
+    }
+    if (!isAuthorized(req.headers.authorization)) {
+      sendJson(res, 401, { error: "Unauthorized" });
+      return;
+    }
+    if (isShutdown) {
+      sendJson(res, 503, { error: "Proxy is shutting down" });
+      return;
+    }
+    const url2 = req.url ?? "";
+    if (req.method === "GET" && url2 === "/info") {
+      handleInfo(res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invoke") {
+      await handleInvoke(req, res);
+      return;
+    }
+    if (req.method === "POST" && url2 === "/invokeBatch") {
+      await handleInvokeBatch(req, res);
+      return;
+    }
+    sendJson(res, 404, { error: "Not found" });
+  });
+  function handleInfo(res) {
+    const response = {
+      targetName: defaultProvider.targetName,
+      maxCalls,
+      callCount,
+      availableTargets: targetsList
+    };
+    sendJson(res, 200, response);
+  }
+  async function handleInvoke(req, res) {
+    // Cheap early rejection before the body is read.
+    if (callCount >= maxCalls) {
+      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
+      return;
+    }
+    try {
+      const parsed = await readJsonBody(req, res);
+      if (!parsed) {
+        return;
+      }
+      const request = parsed.value;
+      if (!hasQuestion(request)) {
+        sendJson(res, 400, { error: "Missing required field: question" });
+        return;
+      }
+      const provider = resolveProvider(request.target);
+      if (!provider) {
+        sendJson(res, 400, {
+          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+        });
+        return;
+      }
+      // Re-check at the point of use: other requests may have spent the budget while this
+      // one was awaiting its body.
+      if (!tryReserveCall()) {
+        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
+        return;
+      }
+      sendJson(res, 200, await invokeProvider(provider, request));
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
+    }
+  }
+  async function handleInvokeBatch(req, res) {
+    try {
+      const parsed = await readJsonBody(req, res);
+      if (!parsed) {
+        return;
+      }
+      const requests = parsed.value?.requests;
+      if (!Array.isArray(requests)) {
+        sendJson(res, 400, { error: "Missing required field: requests (array)" });
+        return;
+      }
+      if (callCount + requests.length > maxCalls) {
+        sendJson(res, 429, {
+          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
+        });
+        return;
+      }
+      const responses = [];
+      // Items run sequentially; a failing item yields an "Error: ..." entry instead of
+      // failing the whole batch.
+      for (const request of requests) {
+        if (!hasQuestion(request)) {
+          responses.push({
+            outputMessages: [],
+            rawText: "Error: Missing required field: question"
+          });
+          continue;
+        }
+        const provider = resolveProvider(request.target);
+        if (!provider) {
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
+          });
+          continue;
+        }
+        // The up-front batch check can be invalidated by requests served while this loop awaits.
+        if (!tryReserveCall()) {
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: Max calls exceeded (limit: ${maxCalls})`
+          });
+          continue;
+        }
+        try {
+          responses.push(await invokeProvider(provider, request));
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          responses.push({
+            outputMessages: [],
+            rawText: `Error: ${message}`
+          });
+        }
+      }
+      sendJson(res, 200, { responses });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      sendJson(res, 500, { error: message });
+    }
+  }
+  await new Promise((resolve, reject) => {
+    server.once("error", reject);
+    // Port 0 lets the OS pick a free port; binding to 127.0.0.1 keeps the proxy local-only.
+    server.listen(0, "127.0.0.1", () => {
+      server.removeListener("error", reject);
+      resolve();
+    });
+  });
+  const address = server.address();
+  const url = `http://127.0.0.1:${address.port}`;
+  return {
+    url,
+    token,
+    shutdown: async () => {
+      // Flag first so requests that arrive while the server is closing get a 503.
+      isShutdown = true;
+      return new Promise((resolve, reject) => {
+        server.close((err) => {
+          if (err) reject(err);
+          else resolve();
+        });
+      });
+    },
+    getUsageMetadata: () => ({
+      callCount,
+      maxCalls
+    })
+  };
+}
+/**
+ * Writes `body` to `res` as a JSON response with the given HTTP status code.
+ */
+function sendJson(res, statusCode, body) {
+  const headers = { "Content-Type": "application/json" };
+  res.writeHead(statusCode, headers);
+  const payload = JSON.stringify(body);
+  res.end(payload);
+}
+/**
+ * Collects every chunk emitted by `req` and resolves with the concatenation decoded as UTF-8.
+ * Rejects with the stream's error if `req` emits "error".
+ */
+function readBody(req) {
+  const received = [];
+  const collect = (chunk) => {
+    received.push(chunk);
+  };
+  return new Promise((resolve, reject) => {
+    const finish = () => {
+      resolve(Buffer.concat(received).toString("utf8"));
+    };
+    req.on("data", collect);
+    req.on("end", finish);
+    req.on("error", reject);
+  });
+}
+/**
+ * Scans `messages` from the end for an assistant message with usable text.
+ *
+ * String content is returned as-is. For array content, the first part that is an object with a
+ * `text` property is returned (stringified). An assistant message with no such part is skipped
+ * and the scan continues with earlier messages. Returns undefined when nothing matches.
+ */
+function extractLastAssistantContent(messages) {
+  let idx = messages.length;
+  while (idx > 0) {
+    idx -= 1;
+    const entry = messages[idx];
+    if (entry.role !== "assistant" || entry.content === void 0) {
+      continue;
+    }
+    const content = entry.content;
+    if (typeof content === "string") {
+      return content;
+    }
+    if (!Array.isArray(content)) {
+      continue;
+    }
+    const textPart = content.find(
+      (part) => typeof part === "object" && part !== null && "text" in part
+    );
+    if (textPart !== void 0) {
+      return String(textPart.text);
+    }
+  }
+  return void 0;
+}
 // src/evaluation/case-conversion.ts
 function toSnakeCase(str) {
   if (/^[A-Z]/.test(str)) {
@@ -6273,12 +6762,6 @@ function toSnakeCase(str) {
   }
   return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
 }
-function toCamelCase(str) {
-  if (/^[A-Z]/.test(str)) {
-    return str;
-  }
-  return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
-}
 function toSnakeCaseDeep(obj) {
   if (obj === null || obj === void 0) {
     return obj;
@@ -6296,61 +6779,184 @@ function toSnakeCaseDeep(obj) {
   }
   return obj;
 }
-function toCamelCaseDeep(obj) {
-  if (obj === null || obj === void 0) {
-    return obj;
-  }
-  if (Array.isArray(obj)) {
-    return obj.map((item) => toCamelCaseDeep(item));
-  }
-  if (typeof obj === "object") {
-    const result = {};
-    for (const [key, value] of Object.entries(obj)) {
-      const camelKey = toCamelCase(key);
-      result[camelKey] = toCamelCaseDeep(value);
-    }
-    return result;
-  }
-  return obj;
-}
-// src/evaluation/providers/types.ts
-var AGENT_PROVIDER_KINDS = [
-  "codex",
-  "pi-coding-agent",
-  "claude-code",
-  "vscode",
-  "vscode-insiders"
-];
-function extractLastAssistantContent(messages) {
-  if (!messages || messages.length === 0) {
-    return "";
+// src/evaluation/evaluators/code-evaluator.ts
+/**
+ * Evaluator that delegates scoring to an external script.
+ *
+ * The eval case and candidate output are serialized to snake_case JSON and handed to
+ * `executeScript`; the script's stdout is passed through `parseJsonSafe` and may carry
+ * `score`, `hits`, `misses`, `reasoning` and `details` fields.
+ */
+var CodeEvaluator = class {
+  kind = "code";
+  script;
+  cwd;
+  agentTimeoutMs;
+  config;
+  target;
+  constructor(options) {
+    this.script = options.script;
+    this.cwd = options.cwd;
+    this.agentTimeoutMs = options.agentTimeoutMs;
+    this.config = options.config;
+    this.target = options.target;
   }
-  for (let i = messages.length - 1; i >= 0; i--) {
-    const msg = messages[i];
-    if (msg.role === "assistant" && msg.content !== void 0) {
-      if (typeof msg.content === "string") {
-        return msg.content;
-      }
-      return JSON.stringify(msg.content);
+  /**
+   * Runs the script for one eval case and converts its output into an evaluation result.
+   *
+   * Never throws for script failures: any error from `executeScript` is turned into a
+   * score-0 "fail" result whose `misses`/`reasoning` carry the error message.
+   */
+  async evaluate(context) {
+    const payload = {
+      question: context.evalCase.question,
+      expectedOutcome: context.evalCase.expected_outcome,
+      expectedMessages: context.evalCase.expected_messages,
+      referenceAnswer: context.evalCase.reference_answer,
+      candidateAnswer: context.candidate,
+      outputMessages: context.outputMessages ?? null,
+      guidelineFiles: context.evalCase.guideline_paths,
+      inputFiles: context.evalCase.file_paths.filter(
+        (path17) => !context.evalCase.guideline_paths.includes(path17)
+      ),
+      inputMessages: context.evalCase.input_messages,
+      traceSummary: context.traceSummary ?? null,
+      config: this.config ?? null
+    };
+    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
+    let proxyEnv;
+    let proxyShutdown;
+    let getProxyUsage;
+    // A target proxy is started only when the evaluator has a `target` config and a judge
+    // provider exists; its URL and token reach the script through environment variables.
+    if (this.target !== void 0 && context.judgeProvider) {
+      const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
+      const proxy = await createTargetProxy({
+        defaultProvider: context.judgeProvider,
+        targetResolver: context.targetResolver,
+        availableTargets: context.availableTargets,
+        maxCalls
+      });
+      proxyEnv = {
+        AGENTV_TARGET_PROXY_URL: proxy.url,
+        AGENTV_TARGET_PROXY_TOKEN: proxy.token
+      };
+      proxyShutdown = proxy.shutdown;
+      getProxyUsage = proxy.getUsageMetadata;
     }
-  }
-  return "";
-}
-function isAgentProvider(provider) {
-  return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
-}
-// src/evaluation/evaluators.ts
-var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
-Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
-Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
-[[ ## expected_outcome ## ]]
-{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
+    try {
+      const stdout = await executeScript(
+        this.script,
+        inputPayload,
+        this.agentTimeoutMs,
+        this.cwd,
+        proxyEnv
+      );
+      const parsed = parseJsonSafe(stdout);
+      // A missing or non-numeric score counts as 0; hits/misses keep only non-empty strings.
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      // `details` is passed through only when it is a plain (non-array) object.
+      const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
+      const proxyUsage = getProxyUsage?.();
+      const evaluatorRawRequest = {
+        script: this.script,
+        ...this.cwd ? { cwd: this.cwd } : {},
+        ...proxyUsage ? {
+          target_proxy: {
+            call_count: proxyUsage.callCount,
+            max_calls: proxyUsage.maxCalls
+          }
+        } : {}
+      };
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest,
+        ...details ? { details } : {}
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      const proxyUsage = getProxyUsage?.();
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code evaluator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          script: this.script,
+          ...this.cwd ? { cwd: this.cwd } : {},
+          ...proxyUsage ? {
+            target_proxy: {
+              call_count: proxyUsage.callCount,
+              max_calls: proxyUsage.maxCalls
+            }
+          } : {},
+          error: message
+        }
+      };
+    } finally {
+      // Stop the proxy server whether the script succeeded or failed.
+      if (proxyShutdown) {
+        await proxyShutdown();
+      }
+    }
+  }
+};
+/**
+ * Runs an evaluator script with `input` as its stdin payload and returns its trimmed stdout.
+ *
+ * A string `scriptPath` is run through `execShellWithStdin`; any other value is handed to
+ * `execFileWithStdin`. `cwd`, `agentTimeoutMs` (as `timeoutMs`) and `env` are forwarded as options.
+ *
+ * @throws {Error} When the exit code is not 0; the message includes formatted stderr if any.
+ */
+async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
+  const execOptions = { cwd, timeoutMs: agentTimeoutMs, env };
+  const runner = typeof scriptPath === "string" ? execShellWithStdin : execFileWithStdin;
+  const outcome = await runner(scriptPath, input, execOptions);
+  if (outcome.exitCode === 0) {
+    return outcome.stdout.trim();
+  }
+  const detail = formatStderr(outcome.stderr);
+  let message = `Code evaluator exited with code ${outcome.exitCode}`;
+  if (detail.length > 0) {
+    message += `: ${detail}`;
+  }
+  throw new Error(message);
+}
+/**
+ * Trims stderr text and, when it exceeds 2000 characters, keeps only the last 2000
+ * behind a one-line truncation notice.
+ */
+function formatStderr(stderr) {
+  const limit = 2e3;
+  const text = stderr.trim();
+  if (text.length > limit) {
+    const notice = `...(truncated, last ${limit} chars)`;
+    return [notice, text.slice(-limit)].join("\n");
+  }
+  return text;
+}
+// src/evaluation/evaluators/composite.ts
+var import_ai3 = require("ai");
+// src/evaluation/providers/types.ts
+// Provider `kind` values that `isAgentProvider` recognizes as agent providers.
+var AGENT_PROVIDER_KINDS = [
+  "codex",
+  "pi-coding-agent",
+  "claude-code",
+  "vscode",
+  "vscode-insiders"
+];
+/**
+ * Returns the content of the last assistant message that has content, or "" when there is none.
+ * String content is returned unchanged; any other content is serialized with JSON.stringify.
+ */
+function extractLastAssistantContent2(messages) {
+  if (!messages || messages.length === 0) {
+    return "";
+  }
+  let cursor = messages.length;
+  while (cursor > 0) {
+    cursor -= 1;
+    const entry = messages[cursor];
+    const hasAssistantContent = entry.role === "assistant" && entry.content !== void 0;
+    if (!hasAssistantContent) {
+      continue;
+    }
+    return typeof entry.content === "string" ? entry.content : JSON.stringify(entry.content);
+  }
+  return "";
+}
+/**
+ * Reports whether `provider` is set and its `kind` is listed in AGENT_PROVIDER_KINDS.
+ */
+function isAgentProvider(provider) {
+  if (!provider) {
+    return false;
+  }
+  return AGENT_PROVIDER_KINDS.includes(provider.kind);
+}
+// src/evaluation/evaluators/llm-judge.ts
+var import_ai2 = require("ai");
+var import_zod3 = require("zod");
+var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
+Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
+[[ ## expected_outcome ## ]]
+{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
 [[ ## question ## ]]
 {{${TEMPLATE_VARIABLES.QUESTION}}}
@@ -6421,7 +7027,7 @@ var LlmJudgeEvaluator = class {
       target: judgeProvider.targetName
     };
     try {
-      const { data, providerResponse } = await this.runWithRetry({
+      const { data } = await this.runWithRetry({
         context,
         judgeProvider,
         systemPrompt,
@@ -6534,7 +7140,7 @@ var LlmJudgeEvaluator = class {
           temperature: this.temperature
         });
         const data = schema.parse(
-          parseJsonFromText(extractLastAssistantContent(response.outputMessages))
+          parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
         );
         return { data, providerResponse: response };
       } catch (e) {
@@ -6570,105 +7176,11 @@ You must return a valid JSON object matching this schema:
   "overall_reasoning": "string (summary)"
 }`;
 }
-function scoreToVerdict(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
-}
-function clampScore(value) {
-  if (Number.isNaN(value) || !Number.isFinite(value)) {
-    return 0;
-  }
-  if (value < 0) {
-    return 0;
-  }
-  if (value > 1) {
-    return 1;
-  }
-  return value;
-}
-function extractJsonBlob(text) {
-  const match = text.match(/\{[\s\S]*\}/);
-  return match?.[0];
-}
-function parseJsonFromText(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const blob = extractJsonBlob(cleaned) ?? cleaned;
-  return JSON.parse(blob);
-}
-function isNonEmptyString(value) {
-  return typeof value === "string" && value.trim().length > 0;
+/**
+ * Replaces each `{{ name }}` placeholder in `template` with `variables[name]`.
+ * Placeholders whose variable is null or undefined are left in the text untouched.
+ */
+function substituteVariables(template, variables) {
+  const placeholder = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
+  const fill = (whole, key) => variables[key] ?? whole;
+  return template.replace(placeholder, fill);
 }
-var CodeEvaluator = class {
-  kind = "code";
-  script;
-  cwd;
-  agentTimeoutMs;
-  config;
-  constructor(options) {
-    this.script = options.script;
-    this.cwd = options.cwd;
-    this.agentTimeoutMs = options.agentTimeoutMs;
-    this.config = options.config;
-  }
-  async evaluate(context) {
-    const payload = {
-      question: context.evalCase.question,
-      expectedOutcome: context.evalCase.expected_outcome,
-      expectedMessages: context.evalCase.expected_messages,
-      referenceAnswer: context.evalCase.reference_answer,
-      candidateAnswer: context.candidate,
-      outputMessages: context.outputMessages ?? null,
-      guidelineFiles: context.evalCase.guideline_paths,
-      inputFiles: context.evalCase.file_paths.filter(
-        (path17) => !context.evalCase.guideline_paths.includes(path17)
-      ),
-      inputMessages: context.evalCase.input_messages,
-      traceSummary: context.traceSummary ?? null,
-      config: this.config ?? null
-    };
-    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
-    try {
-      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
-      return {
-        score,
-        verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
-        evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {}
-        }
-      };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [`Code evaluator failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message,
-        evaluatorRawRequest: {
-          script: this.script,
-          ...this.cwd ? { cwd: this.cwd } : {},
-          error: message
-        }
-      };
-    }
-  }
-};
 function calculateRubricScore(result, rubrics) {
   const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
   const hits = [];
@@ -6696,273 +7208,281 @@ function calculateRubricScore(result, rubrics) {
   const verdict = failedRequired ? "fail" : scoreToVerdict(score);
   return { score, verdict, hits, misses };
 }
-async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
-  if (exitCode !== 0) {
-    const trimmedErr = formatStderr(stderr);
-    throw new Error(
-      trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
-    );
+// src/evaluation/evaluators/composite.ts
+// Prompt used by the composite LLM aggregator when its config supplies no `prompt`.
+// The {{EVALUATOR_RESULTS_JSON}} placeholder is replaced with the member results as JSON.
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
+{{EVALUATOR_RESULTS_JSON}}
+Decide the final score and verdict based on all evaluator results.
+Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
+var CompositeEvaluator = class {
+  kind = "composite";
+  config;
+  evaluatorFactory;
+  cwd;
+  // Stores the composite config, the factory used to build member evaluators, and the
+  // fallback working directory for a code aggregator.
+  constructor(options) {
+    const { config, evaluatorFactory, cwd } = options;
+    this.config = config;
+    this.evaluatorFactory = evaluatorFactory;
+    this.cwd = cwd;
   }
-  return stdout.trim();
-}
-function formatStderr(stderr) {
-  const trimmed = stderr.trim();
-  const maxLength = 2e3;
-  if (trimmed.length <= maxLength) {
-    return trimmed;
+  /**
+   * Runs every configured member evaluator concurrently, then aggregates their results.
+   * Rejects if creating or running any member evaluator throws.
+   */
+  async evaluate(context) {
+    const runMember = async (memberConfig) => {
+      const member = this.evaluatorFactory.create(memberConfig, context);
+      const id = memberConfig.name;
+      const type = memberConfig.type;
+      const result = await member.evaluate(context);
+      return { id, type, result };
+    };
+    const pending = this.config.evaluators.map((memberConfig) => runMember(memberConfig));
+    const memberResults = await Promise.all(pending);
+    return this.aggregate(memberResults, context);
   }
-  const tail = trimmed.slice(-maxLength);
-  return `...(truncated, last ${maxLength} chars)
-${tail}`;
-}
-function parseJsonSafe(payload) {
-  try {
-    return JSON.parse(payload);
-  } catch {
-    return void 0;
+  /**
+   * Dispatches to the aggregation strategy named by `config.aggregator.type`:
+   * "code_judge", "llm_judge", or (for any other value) a weighted average.
+   */
+  async aggregate(results, context) {
+    const aggregator = this.config.aggregator;
+    const strategy = aggregator.type;
+    if (strategy === "code_judge") {
+      return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
+    }
+    if (strategy === "llm_judge") {
+      return this.runLlmAggregator(results, context, aggregator);
+    }
+    return this.runWeightedAverage(results, aggregator.weights);
   }
-}
-function substituteVariables(template, variables) {
-  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
-    return variables[varName] ?? match;
-  });
-}
-function deepEqual(a, b) {
-  if (a === b) return true;
-  if (a === null || b === null) return a === b;
-  if (typeof a !== typeof b) return false;
-  if (typeof a !== "object") return a === b;
-  if (Array.isArray(a) !== Array.isArray(b)) return false;
-  if (Array.isArray(a) && Array.isArray(b)) {
-    if (a.length !== b.length) return false;
-    return a.every((val, i) => deepEqual(val, b[i]));
-  }
-  const aObj = a;
-  const bObj = b;
-  const aKeys = Object.keys(aObj);
-  const bKeys = Object.keys(bObj);
-  if (aKeys.length !== bKeys.length) return false;
-  return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
-}
-function argsMatch(expected, actual) {
-  if (expected === void 0) return true;
-  if (expected === "any") return true;
-  if (actual === void 0) return false;
-  for (const key of Object.keys(expected)) {
-    if (!Object.hasOwn(actual, key)) return false;
-    if (!deepEqual(expected[key], actual[key])) return false;
-  }
-  return true;
-}
-var ToolTrajectoryEvaluator = class {
-  kind = "tool_trajectory";
-  config;
-  constructor(options) {
-    this.config = options.config;
+  /**
+   * Combines member results into one score as a weighted mean.
+   *
+   * Each member's weight is `weights[member.id]`, defaulting to 1. Hits and misses are
+   * concatenated with an `[id]` prefix, and reasonings are joined with "; ". A total weight
+   * of 0 (or less) yields a score of 0.
+   */
+  runWeightedAverage(results, weights) {
+    const weightOf = (member) => weights?.[member.id] ?? 1;
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weightOf(member),
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    const totalWeight = evaluatorResults.reduce((sum, entry) => sum + entry.weight, 0);
+    const weightedSum = evaluatorResults.reduce((sum, entry) => sum + entry.score * entry.weight, 0);
+    const allHits = results.flatMap(
+      (member) => member.result.hits.map((h) => `[${member.id}] ${h}`)
+    );
+    const allMisses = results.flatMap(
+      (member) => member.result.misses.map((m) => `[${member.id}] ${m}`)
+    );
+    const reasoningParts = results
+      .filter((member) => member.result.reasoning)
+      .map((member) => `${member.id}: ${member.result.reasoning}`);
+    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
+    const evaluatorRawRequest = { aggregator: "weighted_average" };
+    if (weights) {
+      evaluatorRawRequest.weights = weights;
+    }
+    return {
+      score: clampScore(finalScore),
+      verdict: scoreToVerdict(finalScore),
+      hits: allHits,
+      misses: allMisses,
+      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
+      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      evaluatorRawRequest,
+      evaluatorResults
+    };
   }
-  evaluate(context) {
-    const { outputMessages, traceSummary } = context;
-    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
-    if (toolCalls.length === 0 && !traceSummary) {
+  /**
+   * Aggregates member results by running an external script.
+   *
+   * The script receives `{ results: { [memberId]: result } }` as JSON and its stdout is passed
+   * through `parseJsonSafe`. A `verdict` of "pass", "fail" or "borderline" in the output is
+   * honored; otherwise the verdict is derived from the score. Script failures become a
+   * score-0 "fail" result carrying the error message rather than a thrown error.
+   *
+   * `weights` only annotates the per-member entries (default 1); it does not affect the score.
+   */
+  async runCodeAggregator(results, scriptPath, cwd, weights) {
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weights?.[member.id] ?? 1,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    try {
+      // `void 0` is passed as the timeout argument, so no timeout is requested here.
+      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
+      return {
+        score,
+        verdict,
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath
+        },
+        evaluatorResults
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
       return {
         score: 0,
         verdict: "fail",
         hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
+        misses: [`Code aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath,
+          error: message
+        },
+        evaluatorResults
       };
     }
-    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
-    if (!summary) {
+  }
+  /**
+   * Aggregates member results by asking the judge provider for a final score.
+   *
+   * The prompt is `config.prompt` (or the default composite prompt) with every
+   * `{{EVALUATOR_RESULTS_JSON}}` replaced by the member results as JSON. If the provider
+   * exposes `asLanguageModel()`, the judge is called through `generateText`; otherwise through
+   * `judgeProvider.invoke`. The reply is parsed against `freeformEvaluationSchema`, and at
+   * most four hits and four misses are kept.
+   *
+   * Failures while calling or parsing the judge produce a score-0 "fail" result that records
+   * the error message, matching how the code aggregator reports its failures.
+   *
+   * @throws {Error} When `context.judgeProvider` is missing.
+   */
+  async runLlmAggregator(results, context, config) {
+    const judgeProvider = context.judgeProvider;
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for LLM aggregation");
+    }
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const resultsJson = JSON.stringify(resultsObject, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults,
+      details: member.result.details
+    }));
+    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
+    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
+    const systemPrompt = buildOutputSchema();
+    const evaluatorRawRequest = {
+      aggregator: "llm_judge",
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    try {
+      // Both transports yield raw judge text that is then parsed and scored identically.
+      const model = judgeProvider.asLanguageModel?.();
+      let judgeText;
+      if (model) {
+        const { text } = await (0, import_ai3.generateText)({
+          model,
+          system: systemPrompt,
+          prompt: userPrompt
+        });
+        judgeText = text;
+      } else {
+        const response = await judgeProvider.invoke({
+          question: userPrompt,
+          systemPrompt,
+          evalCaseId: context.evalCase.id,
+          attempt: context.attempt
+        });
+        judgeText = extractLastAssistantContent2(response.outputMessages);
+      }
+      const data = freeformEvaluationSchema.parse(parseJsonFromText(judgeText));
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: Math.max(hits.length + misses.length, 1),
+        reasoning: data.reasoning,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    } catch (error) {
+      // Record why aggregation failed instead of returning an unexplained zero score.
+      const message = error instanceof Error ? error.message : String(error);
       return {
         score: 0,
         verdict: "fail",
         hits: [],
-        misses: ["No trace available for evaluation"],
-        expectedAspectCount: 1
+        misses: [`LLM aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: { ...evaluatorRawRequest, error: message },
+        evaluatorResults
       };
     }
-    switch (this.config.mode) {
-      case "any_order":
-        return this.evaluateAnyOrder(summary);
-      case "in_order":
-        return this.evaluateInOrder(toolCalls);
-      case "exact":
-        return this.evaluateExact(toolCalls);
-      default:
-        return {
-          score: 0,
-          verdict: "fail",
-          hits: [],
-          misses: [`Unknown mode: ${this.config.mode}`],
-          expectedAspectCount: 1
-        };
-    }
   }
-  /**
-   * Extract tool calls from output messages.
-   */
-  extractToolCallsFromMessages(messages) {
-    if (!messages) {
-      return [];
-    }
-    const toolCalls = [];
-    for (const message of messages) {
-      if (message.toolCalls) {
-        for (const call of message.toolCalls) {
-          toolCalls.push({
-            name: call.tool,
-            args: call.input
-          });
-        }
-      }
-    }
-    return toolCalls;
+};
+// src/evaluation/evaluators/cost.ts
+var CostEvaluator = class {
+  kind = "cost";
+  config;
+  constructor(options) {
+    this.config = options.config;
   }
-  /**
-   * Build a summary from extracted tool calls.
-   */
-  buildSummary(toolCalls) {
-    const toolCallsByName = {};
-    for (const call of toolCalls) {
-      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
-    }
-    const toolNames = Object.keys(toolCallsByName).sort();
-    return {
-      eventCount: toolCalls.length,
-      toolNames,
-      toolCallsByName,
-      errorCount: 0
-    };
-  }
-  evaluateAnyOrder(summary) {
-    const minimums = this.config.minimums ?? {};
-    const toolNames = Object.keys(minimums);
-    if (toolNames.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool requirements specified"],
-        misses: [],
-        expectedAspectCount: 0
-      };
-    }
-    const hits = [];
-    const misses = [];
-    for (const toolName of toolNames) {
-      const required = minimums[toolName];
-      const actual = summary.toolCallsByName[toolName] ?? 0;
-      if (actual >= required) {
-        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
-      } else {
-        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
-      }
-    }
-    const score = hits.length / toolNames.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: toolNames.length
-    };
-  }
-  evaluateInOrder(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
+  evaluate(context) {
+    const { budget } = this.config;
+    const costUsd = context.traceSummary?.costUsd;
+    if (costUsd === void 0) {
       return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
-        expectedAspectCount: 0
-      };
-    }
-    const hits = [];
-    const misses = [];
-    let actualIndex = 0;
-    for (let i = 0; i < expected.length; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      let found = false;
-      let argsMismatch = false;
-      while (actualIndex < toolCalls.length) {
-        const actualCall = toolCalls[actualIndex];
-        if (actualCall.name === expectedTool) {
-          if (argsMatch(expectedItem.args, actualCall.args)) {
-            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
-            actualIndex++;
-            found = true;
-            break;
-          }
-          misses.push(
-            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
-          );
-          actualIndex++;
-          argsMismatch = true;
-          break;
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No cost data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution cost not reported by provider",
+        evaluatorRawRequest: {
+          type: "cost",
+          budget,
+          costUsd: null
         }
-        actualIndex++;
-      }
-      if (!found && !argsMismatch) {
-        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
-      }
-    }
-    const score = hits.length / expected.length;
-    return {
-      score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
-    };
-  }
-  evaluateExact(toolCalls) {
-    const expected = this.config.expected ?? [];
-    if (expected.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool sequence specified"],
-        misses: [],
-        expectedAspectCount: 0
       };
     }
-    const hits = [];
-    const misses = [];
-    if (toolCalls.length !== expected.length) {
-      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
-    }
-    const checkLength = Math.min(expected.length, toolCalls.length);
-    for (let i = 0; i < checkLength; i++) {
-      const expectedItem = expected[i];
-      const expectedTool = expectedItem.tool;
-      const actualCall = toolCalls[i];
-      const actualTool = actualCall.name;
-      if (actualTool === expectedTool) {
-        if (argsMatch(expectedItem.args, actualCall.args)) {
-          hits.push(`Position ${i}: ${expectedTool}`);
-        } else {
-          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
-        }
-      } else {
-        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
-      }
-    }
-    for (let i = checkLength; i < expected.length; i++) {
-      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
-    }
-    const score = hits.length / expected.length;
+    const passed = costUsd <= budget;
+    const score = passed ? 1 : 0;
+    const formatCost = (n) => `$${n.toFixed(4)}`;
     return {
       score,
-      verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
+      verdict: passed ? "pass" : "fail",
+      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
+      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
+      expectedAspectCount: 1,
+      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
+      evaluatorRawRequest: {
+        type: "cost",
+        budget,
+        costUsd
+      }
     };
   }
 };
+// src/evaluation/evaluators/field-accuracy.ts
 var DEFAULT_DATE_FORMATS = [
   "YYYY-MM-DDTHH:mm:ssZ",
   // ISO with timezone
@@ -7058,551 +7578,326 @@ var FieldAccuracyEvaluator = class {
           return message.content;
         }
         if (typeof message.content === "string") {
-          try {
-            return parseJsonFromTextSafe(message.content);
-          } catch {
-          }
-        }
-      }
-    }
-    return void 0;
-  }
-  /**
-   * Evaluate a single field against the expected value.
-   */
-  evaluateField(fieldConfig, candidateData, expectedData) {
-    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
-    const candidateValue = resolvePath(candidateData, path17);
-    const expectedValue = resolvePath(expectedData, path17);
-    if (expectedValue === void 0) {
-      return {
-        path: path17,
-        score: 1,
-        // No expected value means no comparison needed
-        weight,
-        hit: true,
-        message: `${path17}: no expected value`
-      };
-    }
-    if (candidateValue === void 0) {
-      if (required) {
-        return {
-          path: path17,
-          score: 0,
-          weight,
-          hit: false,
-          message: `${path17} (required, missing)`
-        };
-      }
-      return {
-        path: path17,
-        score: 1,
-        // Don't penalize missing optional fields
-        weight: 0,
-        // Zero weight means it won't affect the score
-        hit: true,
-        message: `${path17}: optional field missing`
-      };
-    }
-    switch (match) {
-      case "exact":
-        return this.compareExact(path17, candidateValue, expectedValue, weight);
-      case "numeric_tolerance":
-        return this.compareNumericTolerance(
-          path17,
-          candidateValue,
-          expectedValue,
-          fieldConfig,
-          weight
-        );
-      case "date":
-        return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
-      default:
-        return {
-          path: path17,
-          score: 0,
-          weight,
-          hit: false,
-          message: `${path17}: unknown match type "${match}"`
-        };
-    }
-  }
-  /**
-   * Exact equality comparison.
-   */
-  compareExact(path17, candidateValue, expectedValue, weight) {
-    if (deepEqual(candidateValue, expectedValue)) {
-      return {
-        path: path17,
-        score: 1,
-        weight,
-        hit: true,
-        message: path17
-      };
-    }
-    if (typeof candidateValue !== typeof expectedValue) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
-      };
-    }
-    return {
-      path: path17,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path17} (value mismatch)`
-    };
-  }
-  /**
-   * Numeric comparison with absolute or relative tolerance.
-   */
-  compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
-    const { tolerance = 0, relative = false } = fieldConfig;
-    const candidateNum = toNumber(candidateValue);
-    const expectedNum = toNumber(expectedValue);
-    if (candidateNum === null || expectedNum === null) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (non-numeric value)`
-      };
-    }
-    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (invalid numeric value)`
-      };
-    }
-    const diff = Math.abs(candidateNum - expectedNum);
-    let withinTolerance;
-    if (relative) {
-      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
-      withinTolerance = relativeDiff <= tolerance;
-    } else {
-      withinTolerance = diff <= tolerance;
-    }
-    if (withinTolerance) {
-      return {
-        path: path17,
-        score: 1,
-        weight,
-        hit: true,
-        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
-      };
-    }
-    return {
-      path: path17,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
-    };
-  }
-  /**
-   * Date comparison with format normalization.
-   */
-  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
-    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
-    const candidateDate = parseDate(String(candidateValue), formats);
-    const expectedDate = parseDate(String(expectedValue), formats);
-    if (candidateDate === null) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (unparseable candidate date)`
-      };
-    }
-    if (expectedDate === null) {
-      return {
-        path: path17,
-        score: 0,
-        weight,
-        hit: false,
-        message: `${path17} (unparseable expected date)`
-      };
-    }
-    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
-      return {
-        path: path17,
-        score: 1,
-        weight,
-        hit: true,
-        message: path17
-      };
-    }
-    return {
-      path: path17,
-      score: 0,
-      weight,
-      hit: false,
-      message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
-    };
-  }
-  /**
-   * Aggregate field results using configured strategy.
-   */
-  aggregateResults(results) {
-    const aggregation = this.config.aggregation ?? "weighted_average";
-    const hits = [];
-    const misses = [];
-    for (const result of results) {
-      if (result.hit) {
-        hits.push(result.message);
-      } else {
-        misses.push(result.message);
-      }
-    }
-    let score;
-    if (aggregation === "all_or_nothing") {
-      score = misses.length === 0 ? 1 : 0;
-    } else {
-      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
-      if (totalWeight === 0) {
-        score = results.length === 0 ? 1 : 0;
-      } else {
-        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
-        score = weightedSum / totalWeight;
-      }
-    }
-    const reasoning = `${hits.length}/${results.length} fields matched`;
-    return {
-      score: clampScore(score),
-      verdict: scoreToVerdict(score),
-      hits: hits.slice(0, 4),
-      misses: misses.slice(0, 4),
-      expectedAspectCount: results.length,
-      reasoning
-    };
-  }
-};
-function resolvePath(obj, path17) {
-  if (!path17 || !obj) {
-    return void 0;
-  }
-  const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
-  let current = obj;
-  for (const part of parts) {
-    if (current === null || current === void 0) {
-      return void 0;
-    }
-    if (typeof current !== "object") {
-      return void 0;
-    }
-    const isIndex = /^\d+$/.test(part);
-    if (isIndex && Array.isArray(current)) {
-      current = current[Number.parseInt(part, 10)];
-    } else {
-      current = current[part];
-    }
-  }
-  return current;
-}
-function toNumber(value) {
-  if (typeof value === "number") {
-    return value;
-  }
-  if (typeof value === "string") {
-    const num = Number.parseFloat(value);
-    return Number.isNaN(num) ? null : num;
-  }
-  return null;
-}
-function parseDate(dateStr, formats) {
-  if (!dateStr) return null;
-  const trimmed = dateStr.trim();
-  const isoDate = new Date(trimmed);
-  if (!Number.isNaN(isoDate.getTime())) {
-    return isoDate;
-  }
-  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
-  if (localizedMatch) {
-    const day = Number.parseInt(localizedMatch[1], 10);
-    const monthName = localizedMatch[2].toLowerCase();
-    const year = Number.parseInt(localizedMatch[3], 10);
-    const month = MONTH_NAMES[monthName];
-    if (month !== void 0) {
-      return new Date(year, month, day);
-    }
-  }
-  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
-  if (usMatch) {
-    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
-    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
-    if (hasUSFormat && !hasEUFormat) {
-      const month = Number.parseInt(usMatch[1], 10) - 1;
-      const day = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else if (hasEUFormat && !hasUSFormat) {
-      const day = Number.parseInt(usMatch[1], 10);
-      const month = Number.parseInt(usMatch[2], 10) - 1;
-      const year = Number.parseInt(usMatch[3], 10);
-      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
-        return new Date(year, month, day);
-      }
-    } else {
-      const num1 = Number.parseInt(usMatch[1], 10);
-      const num2 = Number.parseInt(usMatch[2], 10);
-      const year = Number.parseInt(usMatch[3], 10);
-      if (num1 > 12 && num2 <= 12) {
-        return new Date(year, num2 - 1, num1);
-      }
-      if (num2 > 12 && num1 <= 12) {
-        return new Date(year, num1 - 1, num2);
-      }
-      if (num1 <= 12 && num2 <= 31) {
-        return new Date(year, num1 - 1, num2);
-      }
-    }
-  }
-  return null;
-}
-function formatDateISO(date) {
-  return date.toISOString().split("T")[0];
-}
-function parseJsonFromTextSafe(text) {
-  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
-  const match = cleaned.match(/\{[\s\S]*\}/);
-  const blob = match?.[0] ?? cleaned;
-  return JSON.parse(blob);
-}
-var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
-{{EVALUATOR_RESULTS_JSON}}
-Decide the final score and verdict based on all evaluator results.
-Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
-var CompositeEvaluator = class {
-  kind = "composite";
-  config;
-  evaluatorFactory;
-  cwd;
-  constructor(options) {
-    this.config = options.config;
-    this.evaluatorFactory = options.evaluatorFactory;
-    this.cwd = options.cwd;
+          try {
+            return parseJsonFromTextSafe(message.content);
+          } catch {
+          }
+        }
+      }
+    }
+    return void 0;
   }
-  async evaluate(context) {
-    const memberResults = await Promise.all(
-      this.config.evaluators.map(async (memberConfig) => {
-        const evaluator = this.evaluatorFactory.create(memberConfig, context);
+  /**
+   * Evaluate a single field against the expected value.
+   */
+  evaluateField(fieldConfig, candidateData, expectedData) {
+    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
+    const candidateValue = resolvePath(candidateData, path17);
+    const expectedValue = resolvePath(expectedData, path17);
+    if (expectedValue === void 0) {
+      return {
+        path: path17,
+        score: 1,
+        // No expected value means no comparison needed
+        weight,
+        hit: true,
+        message: `${path17}: no expected value`
+      };
+    }
+    if (candidateValue === void 0) {
+      if (required) {
         return {
-          id: memberConfig.name,
-          type: memberConfig.type,
-          result: await evaluator.evaluate(context)
+          path: path17,
+          score: 0,
+          weight,
+          hit: false,
+          message: `${path17} (required, missing)`
         };
-      })
-    );
-    return this.aggregate(memberResults, context);
-  }
-  async aggregate(results, context) {
-    const aggregator = this.config.aggregator;
-    switch (aggregator.type) {
-      case "code_judge":
-        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
-      case "llm_judge":
-        return this.runLlmAggregator(results, context, aggregator);
+      }
+      return {
+        path: path17,
+        score: 1,
+        // Don't penalize missing optional fields
+        weight: 0,
+        // Zero weight means it won't affect the score
+        hit: true,
+        message: `${path17}: optional field missing`
+      };
+    }
+    switch (match) {
+      case "exact":
+        return this.compareExact(path17, candidateValue, expectedValue, weight);
+      case "numeric_tolerance":
+        return this.compareNumericTolerance(
+          path17,
+          candidateValue,
+          expectedValue,
+          fieldConfig,
+          weight
+        );
+      case "date":
+        return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
       default:
-        return this.runWeightedAverage(results, aggregator.weights);
+        return {
+          path: path17,
+          score: 0,
+          weight,
+          hit: false,
+          message: `${path17}: unknown match type "${match}"`
+        };
     }
   }
-  runWeightedAverage(results, weights) {
-    let totalWeight = 0;
-    let weightedSum = 0;
-    const allHits = [];
-    const allMisses = [];
-    const reasoningParts = [];
-    const evaluatorResults = [];
-    for (const member of results) {
-      const weight = weights?.[member.id] ?? 1;
-      totalWeight += weight;
-      weightedSum += member.result.score * weight;
-      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
-      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
-      if (member.result.reasoning) {
-        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
-      }
-      evaluatorResults.push({
-        name: member.id,
-        type: member.type,
-        score: member.result.score,
+  /**
+   * Exact equality comparison.
+   */
+  compareExact(path17, candidateValue, expectedValue, weight) {
+    if (deepEqual(candidateValue, expectedValue)) {
+      return {
+        path: path17,
+        score: 1,
         weight,
-        verdict: member.result.verdict,
-        hits: [...member.result.hits],
-        misses: [...member.result.misses],
-        reasoning: member.result.reasoning,
-        evaluatorRawRequest: member.result.evaluatorRawRequest,
-        evaluatorResults: member.result.evaluatorResults
-      });
+        hit: true,
+        message: path17
+      };
+    }
+    if (typeof candidateValue !== typeof expectedValue) {
+      return {
+        path: path17,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
+      };
     }
-    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
     return {
-      score: clampScore(finalScore),
-      verdict: scoreToVerdict(finalScore),
-      hits: allHits,
-      misses: allMisses,
-      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
-      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
-      evaluatorRawRequest: {
-        aggregator: "weighted_average",
-        ...weights ? { weights } : {}
-      },
-      evaluatorResults
+      path: path17,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path17} (value mismatch)`
     };
   }
-  async runCodeAggregator(results, scriptPath, cwd, weights) {
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      weight: weights?.[member.id] ?? 1,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    try {
-      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
-      const parsed = parseJsonSafe(stdout);
-      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
-      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
-      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
-      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
-      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
+  /**
+   * Numeric comparison with absolute or relative tolerance.
+   */
+  compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
+    const { tolerance = 0, relative = false } = fieldConfig;
+    const candidateNum = toNumber(candidateValue);
+    const expectedNum = toNumber(expectedValue);
+    if (candidateNum === null || expectedNum === null) {
       return {
-        score,
-        verdict,
-        hits,
-        misses,
-        expectedAspectCount: hits.length + misses.length || 1,
-        reasoning,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath
-        },
-        evaluatorResults
+        path: path17,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path17} (non-numeric value)`
       };
-    } catch (error) {
-      const message = error instanceof Error ? error.message : String(error);
+    }
+    if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
+        path: path17,
         score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [`Code aggregator failed: ${message}`],
-        expectedAspectCount: 1,
-        reasoning: message,
-        evaluatorRawRequest: {
-          aggregator: "code_judge",
-          script: scriptPath,
-          error: message
-        },
-        evaluatorResults
+        weight,
+        hit: false,
+        message: `${path17} (invalid numeric value)`
       };
     }
-  }
-  async runLlmAggregator(results, context, config) {
-    const judgeProvider = context.judgeProvider;
-    if (!judgeProvider) {
-      throw new Error("No judge provider available for LLM aggregation");
+    const diff = Math.abs(candidateNum - expectedNum);
+    let withinTolerance;
+    if (relative) {
+      const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
+      withinTolerance = relativeDiff <= tolerance;
+    } else {
+      withinTolerance = diff <= tolerance;
     }
-    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
-    const resultsJson = JSON.stringify(resultsObject, null, 2);
-    const evaluatorResults = results.map((member) => ({
-      name: member.id,
-      type: member.type,
-      score: member.result.score,
-      verdict: member.result.verdict,
-      hits: [...member.result.hits],
-      misses: [...member.result.misses],
-      reasoning: member.result.reasoning,
-      evaluatorRawRequest: member.result.evaluatorRawRequest,
-      evaluatorResults: member.result.evaluatorResults
-    }));
-    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
-    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
-    const systemPrompt = buildOutputSchema();
-    const evaluatorRawRequest = {
-      aggregator: "llm_judge",
-      userPrompt,
-      systemPrompt,
-      target: judgeProvider.targetName
+    if (withinTolerance) {
+      return {
+        path: path17,
+        score: 1,
+        weight,
+        hit: true,
+        message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
+      };
+    }
+    return {
+      path: path17,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
     };
-    try {
-      const model = judgeProvider.asLanguageModel?.();
-      if (model) {
-        const { text } = await (0, import_ai2.generateText)({
-          model,
-          system: systemPrompt,
-          prompt: userPrompt
-        });
-        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
-        const score2 = clampScore(data2.score);
-        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
-        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
-        const reasoning2 = data2.reasoning;
-        return {
-          score: score2,
-          verdict: scoreToVerdict(score2),
-          hits: hits2,
-          misses: misses2,
-          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
-          reasoning: reasoning2,
-          evaluatorRawRequest,
-          evaluatorResults
-        };
-      }
-      const response = await judgeProvider.invoke({
-        question: userPrompt,
-        systemPrompt,
-        evalCaseId: context.evalCase.id,
-        attempt: context.attempt
-      });
-      const data = freeformEvaluationSchema.parse(
-        parseJsonFromText(extractLastAssistantContent(response.outputMessages))
-      );
-      const score = clampScore(data.score);
-      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
-      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
-      const reasoning = data.reasoning;
+  }
+  /**
+   * Date comparison with format normalization.
+   */
+  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
+    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
+    const candidateDate = parseDate(String(candidateValue), formats);
+    const expectedDate = parseDate(String(expectedValue), formats);
+    if (candidateDate === null) {
       return {
-        score,
-        verdict: scoreToVerdict(score),
-        hits,
-        misses,
-        expectedAspectCount: Math.max(hits.length + misses.length, 1),
-        reasoning,
-        evaluatorRawRequest,
-        evaluatorResults
+        path: path17,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${path17} (unparseable candidate date)`
       };
-    } catch {
+    }
+    if (expectedDate === null) {
       return {
+        path: path17,
         score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: [],
-        expectedAspectCount: 1,
-        evaluatorRawRequest,
-        evaluatorResults
+        weight,
+        hit: false,
+        message: `${path17} (unparseable expected date)`
+      };
+    }
+    if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
+      return {
+        path: path17,
+        score: 1,
+        weight,
+        hit: true,
+        message: path17
       };
     }
+    return {
+      path: path17,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+    };
+  }
+  /**
+   * Aggregate field results using configured strategy.
+   */
+  aggregateResults(results) {
+    const aggregation = this.config.aggregation ?? "weighted_average";
+    const hits = [];
+    const misses = [];
+    for (const result of results) {
+      if (result.hit) {
+        hits.push(result.message);
+      } else {
+        misses.push(result.message);
+      }
+    }
+    let score;
+    if (aggregation === "all_or_nothing") {
+      score = misses.length === 0 ? 1 : 0;
+    } else {
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
+      if (totalWeight === 0) {
+        score = results.length === 0 ? 1 : 0;
+      } else {
+        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
+        score = weightedSum / totalWeight;
+      }
+    }
+    const reasoning = `${hits.length}/${results.length} fields matched`;
+    return {
+      score: clampScore(score),
+      verdict: scoreToVerdict(score),
+      hits: hits.slice(0, 4),
+      misses: misses.slice(0, 4),
+      expectedAspectCount: results.length,
+      reasoning
+    };
   }
 };
+function resolvePath(obj, path17) {
+  if (!path17 || !obj) {
+    return void 0;
+  }
+  const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  let current = obj;
+  for (const part of parts) {
+    if (current === null || current === void 0) {
+      return void 0;
+    }
+    if (typeof current !== "object") {
+      return void 0;
+    }
+    const isIndex = /^\d+$/.test(part);
+    if (isIndex && Array.isArray(current)) {
+      current = current[Number.parseInt(part, 10)];
+    } else {
+      current = current[part];
+    }
+  }
+  return current;
+}
+function toNumber(value) {
+  if (typeof value === "number") {
+    return value;
+  }
+  if (typeof value === "string") {
+    const num = Number.parseFloat(value);
+    return Number.isNaN(num) ? null : num;
+  }
+  return null;
+}
+function parseDate(dateStr, formats) {
+  if (!dateStr) return null;
+  const trimmed = dateStr.trim();
+  const isoDate = new Date(trimmed);
+  if (!Number.isNaN(isoDate.getTime())) {
+    return isoDate;
+  }
+  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
+  if (localizedMatch) {
+    const day = Number.parseInt(localizedMatch[1], 10);
+    const monthName = localizedMatch[2].toLowerCase();
+    const year = Number.parseInt(localizedMatch[3], 10);
+    const month = MONTH_NAMES[monthName];
+    if (month !== void 0) {
+      return new Date(year, month, day);
+    }
+  }
+  const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
+  if (usMatch) {
+    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
+    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
+    if (hasUSFormat && !hasEUFormat) {
+      const month = Number.parseInt(usMatch[1], 10) - 1;
+      const day = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else if (hasEUFormat && !hasUSFormat) {
+      const day = Number.parseInt(usMatch[1], 10);
+      const month = Number.parseInt(usMatch[2], 10) - 1;
+      const year = Number.parseInt(usMatch[3], 10);
+      if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
+        return new Date(year, month, day);
+      }
+    } else {
+      const num1 = Number.parseInt(usMatch[1], 10);
+      const num2 = Number.parseInt(usMatch[2], 10);
+      const year = Number.parseInt(usMatch[3], 10);
+      if (num1 > 12 && num2 <= 12) {
+        return new Date(year, num2 - 1, num1);
+      }
+      if (num2 > 12 && num1 <= 12) {
+        return new Date(year, num1 - 1, num2);
+      }
+      if (num1 <= 12 && num2 <= 31) {
+        return new Date(year, num1 - 1, num2);
+      }
+    }
+  }
+  return null;
+}
+function formatDateISO(date) {
+  return date.toISOString().split("T")[0];
+}
+function parseJsonFromTextSafe(text) {
+  return parseJsonFromText(text);
+}
+// src/evaluation/evaluators/latency.ts
 var LatencyEvaluator = class {
   kind = "latency";
   config;
@@ -7639,53 +7934,13 @@ var LatencyEvaluator = class {
       evaluatorRawRequest: {
         type: "latency",
         threshold,
-        durationMs
-      }
-    };
-  }
-};
-var CostEvaluator = class {
-  kind = "cost";
-  config;
-  constructor(options) {
-    this.config = options.config;
-  }
-  evaluate(context) {
-    const { budget } = this.config;
-    const costUsd = context.traceSummary?.costUsd;
-    if (costUsd === void 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No cost data available in trace"],
-        expectedAspectCount: 1,
-        reasoning: "Execution cost not reported by provider",
-        evaluatorRawRequest: {
-          type: "cost",
-          budget,
-          costUsd: null
-        }
-      };
-    }
-    const passed = costUsd <= budget;
-    const score = passed ? 1 : 0;
-    const formatCost = (n) => `$${n.toFixed(4)}`;
-    return {
-      score,
-      verdict: passed ? "pass" : "fail",
-      hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
-      misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
-      expectedAspectCount: 1,
-      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
-      evaluatorRawRequest: {
-        type: "cost",
-        budget,
-        costUsd
+        durationMs
       }
     };
   }
 };
+// src/evaluation/evaluators/token-usage.ts
 var TokenUsageEvaluator = class {
   kind = "token_usage";
   config;
@@ -7769,8 +8024,228 @@ var TokenUsageEvaluator = class {
   }
 };
+// src/evaluation/evaluators/tool-trajectory.ts
+/**
+ * Check whether the arguments of an actual tool call satisfy the expected ones.
+ *
+ * Matching is a subset check: every key listed in `expected` must be an own
+ * property of `actual` with a value that `deepEqual` considers equal. Keys that
+ * exist only on `actual` are ignored.
+ *
+ * @param {object|"any"|undefined} expected - Expected arguments. `undefined`
+ *   or the string "any" means the arguments are not constrained.
+ * @param {*} actual - Arguments recorded for the actual tool call.
+ * @returns {boolean} True when `actual` satisfies `expected`.
+ */
+function argsMatch(expected, actual) {
+  if (expected === void 0) return true;
+  if (expected === "any") return true;
+  // A call recorded without an argument object (undefined, null or a
+  // primitive) can never satisfy concrete expectations. Guarding here also
+  // keeps Object.hasOwn from throwing a TypeError on null.
+  if (actual === null || typeof actual !== "object") {
+    return Object.keys(expected).length === 0 && actual !== void 0 && actual !== null;
+  }
+  for (const key of Object.keys(expected)) {
+    if (!Object.hasOwn(actual, key)) return false;
+    if (!deepEqual(expected[key], actual[key])) return false;
+  }
+  return true;
+}
+/**
+ * Evaluator that scores a run by the tool calls the agent made.
+ *
+ * `config.mode` selects the comparison:
+ *  - "any_order": each tool in `config.minimums` must be called at least the
+ *    given number of times; order is irrelevant.
+ *  - "in_order": the tools in `config.expected` must appear in that order;
+ *    other calls may occur in between.
+ *  - "exact": the call at each position must match `config.expected`.
+ *
+ * Every mode returns `{ score, verdict, hits, misses, expectedAspectCount }`.
+ */
+var ToolTrajectoryEvaluator = class {
+  kind = "tool_trajectory";
+  config;
+  /**
+   * @param {{config: object}} options - `options.config` is stored as-is and
+   *   read for `mode`, `minimums` and `expected`.
+   */
+  constructor(options) {
+    this.config = options.config;
+  }
+  /**
+   * Score the trajectory found in the evaluation context.
+   *
+   * Tool calls are taken from `context.outputMessages`. Only when none are
+   * found is `context.traceSummary` used instead, and only "any_order" reads
+   * it: "in_order" and "exact" always work on the extracted call list.
+   *
+   * @param {object} context - Reads `outputMessages` and `traceSummary`.
+   * @returns {object} Score object for the configured mode, or a failing
+   *   result when no trace is available or the mode is unknown.
+   */
+  evaluate(context) {
+    const { outputMessages, traceSummary } = context;
+    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
+    if (toolCalls.length === 0 && !traceSummary) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No trace available for evaluation"],
+        expectedAspectCount: 1
+      };
+    }
+    // Past the guard above either tool calls exist (buildSummary always
+    // returns an object) or traceSummary is set, so the summary is defined.
+    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
+    switch (this.config.mode) {
+      case "any_order":
+        return this.evaluateAnyOrder(summary);
+      case "in_order":
+        return this.evaluateInOrder(toolCalls);
+      case "exact":
+        return this.evaluateExact(toolCalls);
+      default:
+        return {
+          score: 0,
+          verdict: "fail",
+          hits: [],
+          misses: [`Unknown mode: ${this.config.mode}`],
+          expectedAspectCount: 1
+        };
+    }
+  }
+  /**
+   * Extract tool calls from output messages.
+   *
+   * @param {Array<object>|undefined} messages - Messages whose optional
+   *   `toolCalls` entries carry `tool` and `input`.
+   * @returns {Array<{name: *, args: *}>} Calls in message order; empty when
+   *   `messages` is missing.
+   */
+  extractToolCallsFromMessages(messages) {
+    if (!messages) {
+      return [];
+    }
+    const toolCalls = [];
+    for (const message of messages) {
+      if (message.toolCalls) {
+        for (const call of message.toolCalls) {
+          toolCalls.push({
+            name: call.tool,
+            args: call.input
+          });
+        }
+      }
+    }
+    return toolCalls;
+  }
+  /**
+   * Build a summary from extracted tool calls.
+   *
+   * @param {Array<{name: *}>} toolCalls - Calls to count.
+   * @returns {{eventCount: number, toolNames: string[], toolCallsByName: object, errorCount: number}}
+   *   Per-tool call counts and the sorted tool names; `errorCount` is always 0.
+   */
+  buildSummary(toolCalls) {
+    const toolCallsByName = {};
+    for (const call of toolCalls) {
+      // Read own properties only: a tool named "constructor" or "toString"
+      // would otherwise pick up the inherited function instead of a count.
+      const previous = Object.hasOwn(toolCallsByName, call.name) ? toolCallsByName[call.name] : 0;
+      toolCallsByName[call.name] = previous + 1;
+    }
+    const toolNames = Object.keys(toolCallsByName).sort();
+    return {
+      eventCount: toolCalls.length,
+      toolNames,
+      toolCallsByName,
+      errorCount: 0
+    };
+  }
+  /**
+   * "any_order" mode: check the per-tool minimum call counts.
+   *
+   * @param {object} summary - Object with a `toolCallsByName` count map.
+   * @returns {object} Score is the fraction of tools in `config.minimums`
+   *   whose minimum was reached; a pass with score 1 when there are none.
+   */
+  evaluateAnyOrder(summary) {
+    const minimums = this.config.minimums ?? {};
+    const toolNames = Object.keys(minimums);
+    if (toolNames.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool requirements specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    for (const toolName of toolNames) {
+      const required = minimums[toolName];
+      // Own-property lookup so names inherited from Object.prototype count as 0.
+      const recorded = Object.hasOwn(summary.toolCallsByName, toolName) ? summary.toolCallsByName[toolName] : void 0;
+      const actual = recorded ?? 0;
+      if (actual >= required) {
+        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      } else {
+        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
+      }
+    }
+    const score = hits.length / toolNames.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: toolNames.length
+    };
+  }
+  /**
+   * "in_order" mode: find each expected tool in sequence within the calls.
+   *
+   * The actual calls are scanned once, left to right. For each expected item
+   * the scan skips calls to other tools and stops at the first call with the
+   * expected name. That call is consumed whether or not its args match; on
+   * an args mismatch the expected item is a miss and is not searched for
+   * again later in the trace.
+   *
+   * @param {Array<{name: *, args: *}>} toolCalls - Actual calls in order.
+   * @returns {object} Score is hits divided by the number of expected items.
+   */
+  evaluateInOrder(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    let actualIndex = 0;
+    for (let i = 0; i < expected.length; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      let found = false;
+      let argsMismatch = false;
+      while (actualIndex < toolCalls.length) {
+        const actualCall = toolCalls[actualIndex];
+        if (actualCall.name === expectedTool) {
+          if (argsMatch(expectedItem.args, actualCall.args)) {
+            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
+            actualIndex++;
+            found = true;
+            break;
+          }
+          misses.push(
+            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
+          );
+          actualIndex++;
+          argsMismatch = true;
+          break;
+        }
+        actualIndex++;
+      }
+      // The args-mismatch case has already recorded its own miss above.
+      if (!found && !argsMismatch) {
+        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
+      }
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+  /**
+   * "exact" mode: compare actual and expected calls position by position.
+   *
+   * A length difference adds a miss message. The score is hits divided by
+   * the number of expected items, so surplus actual calls after the expected
+   * sequence are reported in `misses` but do not lower the score.
+   *
+   * @param {Array<{name: *, args: *}>} toolCalls - Actual calls in order.
+   * @returns {object} Score object for the positional comparison.
+   */
+  evaluateExact(toolCalls) {
+    const expected = this.config.expected ?? [];
+    if (expected.length === 0) {
+      return {
+        score: 1,
+        verdict: "pass",
+        hits: ["No tool sequence specified"],
+        misses: [],
+        expectedAspectCount: 0
+      };
+    }
+    const hits = [];
+    const misses = [];
+    if (toolCalls.length !== expected.length) {
+      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
+    }
+    const checkLength = Math.min(expected.length, toolCalls.length);
+    for (let i = 0; i < checkLength; i++) {
+      const expectedItem = expected[i];
+      const expectedTool = expectedItem.tool;
+      const actualCall = toolCalls[i];
+      const actualTool = actualCall.name;
+      if (actualTool === expectedTool) {
+        if (argsMatch(expectedItem.args, actualCall.args)) {
+          hits.push(`Position ${i}: ${expectedTool}`);
+        } else {
+          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
+        }
+      } else {
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    // Expected items beyond the end of the actual trace are all misses.
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
 // src/evaluation/orchestrator.ts
-var import_node_crypto4 = require("crypto");
+var import_node_crypto5 = require("crypto");
 var import_node_path16 = __toESM(require("path"), 1);
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -7982,6 +8457,17 @@ async function runEvaluation(options) {
     }
     return getOrCreateProvider(resolvedJudge);
   };
+  const targetResolver = (name) => {
+    const resolved = resolveTargetByName(name);
+    if (!resolved) {
+      return void 0;
+    }
+    return getOrCreateProvider(resolved);
+  };
+  const availableTargets = [
+    target.name,
+    ...Array.from(targetDefinitions.keys())
+  ];
   const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
   const primaryProvider = getOrCreateProvider(target);
   const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -8011,7 +8497,9 @@ async function runEvaluation(options) {
         onResult,
         verbose,
         resolveJudgeProvider,
-        agentTimeoutMs
+        agentTimeoutMs,
+        targetResolver,
+        availableTargets
       });
     } catch (error) {
       if (verbose) {
@@ -8050,7 +8538,9 @@ async function runEvaluation(options) {
           cache,
           useCache,
           now,
-          judgeProvider
+          judgeProvider,
+          targetResolver,
+          availableTargets
         });
         if (onProgress) {
           await onProgress({
@@ -8117,7 +8607,9 @@ async function runBatchEvaluation(options) {
     onProgress,
     onResult,
     resolveJudgeProvider,
-    agentTimeoutMs
+    agentTimeoutMs,
+    targetResolver,
+    availableTargets
   } = options;
   const promptInputsList = [];
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -8176,7 +8668,7 @@ async function runBatchEvaluation(options) {
       costUsd: providerResponse.costUsd,
       durationMs: providerResponse.durationMs
     }) : void 0;
-    const candidate = extractLastAssistantContent(outputMessages);
+    const candidate = extractLastAssistantContent2(outputMessages);
     const providerError = extractProviderError(providerResponse);
     let result;
     try {
@@ -8192,7 +8684,9 @@ async function runBatchEvaluation(options) {
         judgeProvider: await resolveJudgeProvider(target),
         agentTimeoutMs,
         outputMessages,
-        traceSummary
+        traceSummary,
+        targetResolver,
+        availableTargets
       });
       if (providerError) {
         result = { ...result, error: providerError };
@@ -8250,7 +8744,9 @@ async function runEvalCase(options) {
     cache,
     useCache,
     signal,
-    judgeProvider
+    judgeProvider,
+    targetResolver,
+    availableTargets
   } = options;
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -8309,7 +8805,7 @@ async function runEvalCase(options) {
     costUsd: providerResponse.costUsd,
     durationMs: providerResponse.durationMs
   }) : void 0;
-  const candidate = extractLastAssistantContent(outputMessages);
+  const candidate = extractLastAssistantContent2(outputMessages);
   const providerError = extractProviderError(providerResponse);
   try {
     const result = await evaluateCandidate({
@@ -8324,7 +8820,9 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
     return providerError ? { ...result, error: providerError } : result;
   } catch (error) {
@@ -8344,7 +8842,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const gradeTimestamp = nowFn();
   const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -8359,7 +8859,9 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   const completedAt = nowFn();
   let agentProviderRequest;
@@ -8412,7 +8914,9 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
     return runEvaluatorList({
@@ -8428,7 +8932,9 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       outputMessages,
-      traceSummary
+      traceSummary,
+      targetResolver,
+      availableTargets
     });
   }
   const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -8446,7 +8952,9 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   });
   return { score };
 }
@@ -8464,7 +8972,9 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     outputMessages,
-    traceSummary
+    traceSummary,
+    targetResolver,
+    availableTargets
   } = options;
   const scored = [];
   const evaluatorResults = [];
@@ -8502,7 +9012,8 @@ async function runEvaluatorList(options) {
           script: evaluator.script,
           cwd: evaluator.resolvedCwd ?? evaluator.cwd,
           agentTimeoutMs,
-          config: evaluator.config
+          config: evaluator.config,
+          target: evaluator.target
         });
         const score2 = await codeEvaluator.evaluate({
           evalCase,
@@ -8512,8 +9023,11 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
+          judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -8526,7 +9040,8 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluatorProviderRequest: score2.evaluatorRawRequest
+          evaluatorProviderRequest: score2.evaluatorRawRequest,
+          details: score2.details
         });
       }
       if (evaluator.type === "composite") {
@@ -8540,7 +9055,8 @@ async function runEvaluatorList(options) {
                 script: memberConfig.script,
                 cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
                 agentTimeoutMs,
-                config: memberConfig.config
+                config: memberConfig.config,
+                target: memberConfig.target
               });
             case "composite":
               return new CompositeEvaluator({
@@ -8589,7 +9105,9 @@ async function runEvaluatorList(options) {
           now,
           judgeProvider,
           outputMessages,
-          traceSummary
+          traceSummary,
+          targetResolver,
+          availableTargets
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -8785,11 +9303,11 @@ async function runEvaluatorList(options) {
     (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
     0
   );
-  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
+  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
     score: aggregateScore,
-    verdict: scoreToVerdict2(aggregateScore),
+    verdict: scoreToVerdict(aggregateScore),
     hits,
     misses,
     expectedAspectCount,
@@ -8836,18 +9354,6 @@ async function resolveCustomPrompt(config) {
   }
   return config.prompt;
 }
-function isNonEmptyString2(value) {
-  return typeof value === "string" && value.trim().length > 0;
-}
-function scoreToVerdict2(score) {
-  if (score >= 0.8) {
-    return "pass";
-  }
-  if (score >= 0.6) {
-    return "borderline";
-  }
-  return "fail";
-}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
@@ -8949,7 +9455,7 @@ function extractProviderError(response) {
   return trimmed.length > 0 ? trimmed : void 0;
 }
 function createCacheKey(provider, target, evalCase, promptInputs) {
-  const hash = (0, import_node_crypto4.createHash)("sha256");
+  const hash = (0, import_node_crypto5.createHash)("sha256");
   hash.update(provider.id);
   hash.update(target.name);
   hash.update(evalCase.id);
@@ -8990,7 +9496,8 @@ function mapChildResults(children) {
     misses: child.misses,
     reasoning: child.reasoning,
     evaluatorProviderRequest: child.evaluatorRawRequest,
-    evaluatorResults: mapChildResults(child.evaluatorResults)
+    evaluatorResults: mapChildResults(child.evaluatorResults),
+    details: child.details
   }));
 }
 function computeWeightedMean(entries) {
@@ -9005,7 +9512,7 @@ function computeWeightedMean(entries) {
 }
 // src/evaluation/generators/rubric-generator.ts
-var import_ai3 = require("ai");
+var import_ai4 = require("ai");
 var import_zod4 = require("zod");
 var rubricItemSchema = import_zod4.z.object({
   id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -9039,7 +9546,7 @@ You must return a valid JSON object matching this schema:
   let lastError;
   for (let attempt = 1; attempt <= 3; attempt++) {
     try {
-      const { text } = await (0, import_ai3.generateText)({
+      const { text } = await (0, import_ai4.generateText)({
         model,
         system,
         prompt
@@ -9084,17 +9591,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
   return parts.join("\n");
 }
-// src/evaluation/code-judge-sdk.ts
-var import_node_fs7 = require("fs");
-function parseCodeJudgePayload(payload) {
-  const parsed = JSON.parse(payload);
-  return toCamelCaseDeep(parsed);
-}
-function readCodeJudgePayload() {
-  const stdin = (0, import_node_fs7.readFileSync)(0, "utf8");
-  return parseCodeJudgePayload(stdin);
-}
 // src/index.ts
 function createAgentKernel() {
   return { status: "stub" };
@@ -9113,33 +9609,39 @@ function createAgentKernel() {
   ToolTrajectoryEvaluator,
   avgToolDurationMs,
   buildDirectoryChain,
+  buildOutputSchema,
   buildPromptInputs,
   buildSearchRoots,
+  clampScore,
   computeTraceSummary,
   consumeClaudeCodeLogEntries,
   consumeCodexLogEntries,
   consumePiLogEntries,
   createAgentKernel,
   createProvider,
+  deepEqual,
   ensureVSCodeSubagents,
+  executeScript,
   explorationRatio,
-  extractCodeBlocks,
+  extractJsonBlob,
   fileExists,
   findGitRoot,
+  freeformEvaluationSchema,
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,
+  isNonEmptyString,
   isTestMessage,
   isTestMessageRole,
   listTargetNames,
   loadEvalCases,
   mergeExecutionMetrics,
   normalizeLineEndings,
-  parseCodeJudgePayload,
-  readCodeJudgePayload,
+  parseJsonFromText,
+  parseJsonSafe,
   readJsonFile,
   readTargetDefinitions,
   readTestSuiteMetadata,
@@ -9149,6 +9651,7 @@ function createAgentKernel() {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
+  scoreToVerdict,
   subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries,
   subscribeToPiLogEntries,