@agentv/core 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -42,31 +42,39 @@ __export(index_exports, {
42
42
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
43
43
  avgToolDurationMs: () => avgToolDurationMs,
44
44
  buildDirectoryChain: () => buildDirectoryChain2,
45
+ buildOutputSchema: () => buildOutputSchema,
45
46
  buildPromptInputs: () => buildPromptInputs,
46
47
  buildSearchRoots: () => buildSearchRoots2,
48
+ clampScore: () => clampScore,
47
49
  computeTraceSummary: () => computeTraceSummary,
48
50
  consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
49
51
  consumeCodexLogEntries: () => consumeCodexLogEntries,
50
52
  consumePiLogEntries: () => consumePiLogEntries,
51
53
  createAgentKernel: () => createAgentKernel,
52
54
  createProvider: () => createProvider,
55
+ deepEqual: () => deepEqual,
53
56
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
57
+ executeScript: () => executeScript,
54
58
  explorationRatio: () => explorationRatio,
55
- extractCodeBlocks: () => extractCodeBlocks,
59
+ extractJsonBlob: () => extractJsonBlob,
56
60
  fileExists: () => fileExists2,
57
61
  findGitRoot: () => findGitRoot,
62
+ freeformEvaluationSchema: () => freeformEvaluationSchema,
58
63
  generateRubrics: () => generateRubrics,
59
64
  getHitCount: () => getHitCount,
60
65
  isEvaluatorKind: () => isEvaluatorKind,
61
66
  isGuidelineFile: () => isGuidelineFile,
62
67
  isJsonObject: () => isJsonObject,
63
68
  isJsonValue: () => isJsonValue,
69
+ isNonEmptyString: () => isNonEmptyString,
64
70
  isTestMessage: () => isTestMessage,
65
71
  isTestMessageRole: () => isTestMessageRole,
66
72
  listTargetNames: () => listTargetNames,
67
73
  loadEvalCases: () => loadEvalCases,
68
74
  mergeExecutionMetrics: () => mergeExecutionMetrics,
69
75
  normalizeLineEndings: () => normalizeLineEndings,
76
+ parseJsonFromText: () => parseJsonFromText,
77
+ parseJsonSafe: () => parseJsonSafe,
70
78
  readJsonFile: () => readJsonFile,
71
79
  readTargetDefinitions: () => readTargetDefinitions,
72
80
  readTestSuiteMetadata: () => readTestSuiteMetadata,
@@ -76,6 +84,7 @@ __export(index_exports, {
76
84
  resolveTargetDefinition: () => resolveTargetDefinition,
77
85
  runEvalCase: () => runEvalCase,
78
86
  runEvaluation: () => runEvaluation,
87
+ scoreToVerdict: () => scoreToVerdict,
79
88
  subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
80
89
  subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
81
90
  subscribeToPiLogEntries: () => subscribeToPiLogEntries,
@@ -221,85 +230,6 @@ var import_promises6 = require("fs/promises");
221
230
  var import_node_path6 = __toESM(require("path"), 1);
222
231
  var import_yaml2 = require("yaml");
223
232
 
224
- // src/evaluation/formatting/segment-formatter.ts
225
- function extractCodeBlocks(segments) {
226
- const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
227
- const codeBlocks = [];
228
- for (const segment of segments) {
229
- const typeValue = segment.type;
230
- if (typeof typeValue !== "string" || typeValue !== "text") {
231
- continue;
232
- }
233
- const textValue = segment.value;
234
- if (typeof textValue !== "string") {
235
- continue;
236
- }
237
- const matches = textValue.match(CODE_BLOCK_PATTERN);
238
- if (matches) {
239
- codeBlocks.push(...matches);
240
- }
241
- }
242
- return codeBlocks;
243
- }
244
- function formatFileContents(parts) {
245
- const fileCount = parts.filter((p) => p.isFile).length;
246
- if (fileCount > 0) {
247
- return parts.map((part) => {
248
- if (part.isFile && part.displayPath) {
249
- return `<file path="${part.displayPath}">
250
- ${part.content}
251
- </file>`;
252
- }
253
- return part.content;
254
- }).join("\n\n");
255
- }
256
- return parts.map((p) => p.content).join(" ");
257
- }
258
- function formatSegment(segment, mode = "lm") {
259
- const type = asString(segment.type);
260
- if (type === "text") {
261
- return asString(segment.value);
262
- }
263
- if (type === "guideline_ref") {
264
- const refPath = asString(segment.path);
265
- return refPath ? `<Attached: ${refPath}>` : void 0;
266
- }
267
- if (type === "file") {
268
- const filePath = asString(segment.path);
269
- if (!filePath) {
270
- return void 0;
271
- }
272
- if (mode === "agent") {
273
- return `<file: path="${filePath}">`;
274
- }
275
- const text = asString(segment.text);
276
- if (text && filePath) {
277
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
278
- }
279
- }
280
- return void 0;
281
- }
282
- function hasVisibleContent(segments) {
283
- return segments.some((segment) => {
284
- const type = asString(segment.type);
285
- if (type === "text") {
286
- const value = asString(segment.value);
287
- return value !== void 0 && value.trim().length > 0;
288
- }
289
- if (type === "guideline_ref") {
290
- return false;
291
- }
292
- if (type === "file") {
293
- const text = asString(segment.text);
294
- return text !== void 0 && text.trim().length > 0;
295
- }
296
- return false;
297
- });
298
- }
299
- function asString(value) {
300
- return typeof value === "string" ? value : void 0;
301
- }
302
-
303
233
  // src/evaluation/loaders/config-loader.ts
304
234
  var import_promises2 = require("fs/promises");
305
235
  var import_node_path2 = __toESM(require("path"), 1);
@@ -554,7 +484,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
554
484
  logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
555
485
  continue;
556
486
  }
557
- const name = asString2(rawEvaluator.name);
487
+ const name = asString(rawEvaluator.name);
558
488
  const typeValue = rawEvaluator.type;
559
489
  if (!name || !isEvaluatorKind(typeValue)) {
560
490
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -582,7 +512,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
582
512
  continue;
583
513
  }
584
514
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
585
- const cwd = asString2(rawEvaluator.cwd);
515
+ const cwd = asString(rawEvaluator.cwd);
586
516
  let resolvedCwd;
587
517
  if (cwd) {
588
518
  const resolved = await resolveFileReference(cwd, searchRoots);
@@ -597,7 +527,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
597
527
  } else {
598
528
  resolvedCwd = searchRoots[0];
599
529
  }
600
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
530
+ const rawTarget = rawEvaluator.target;
531
+ let targetConfig;
532
+ if (rawTarget !== void 0) {
533
+ if (isJsonObject2(rawTarget)) {
534
+ const maxCalls = rawTarget.max_calls;
535
+ if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
536
+ logWarning2(
537
+ `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
538
+ );
539
+ } else {
540
+ targetConfig = {
541
+ ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
542
+ };
543
+ }
544
+ } else if (rawTarget === true) {
545
+ targetConfig = {};
546
+ } else {
547
+ logWarning2(
548
+ `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
549
+ );
550
+ }
551
+ }
552
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
601
553
  const config = {};
602
554
  for (const [key, value] of Object.entries(rawEvaluator)) {
603
555
  if (!knownProps.has(key) && value !== void 0) {
@@ -611,7 +563,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
611
563
  cwd,
612
564
  resolvedCwd,
613
565
  ...weight2 !== void 0 ? { weight: weight2 } : {},
614
- ...Object.keys(config).length > 0 ? { config } : {}
566
+ ...Object.keys(config).length > 0 ? { config } : {},
567
+ ...targetConfig !== void 0 ? { target: targetConfig } : {}
615
568
  });
616
569
  continue;
617
570
  }
@@ -628,7 +581,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
628
581
  logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
629
582
  continue;
630
583
  }
631
- const aggregatorType = asString2(rawAggregator.type);
584
+ const aggregatorType = asString(rawAggregator.type);
632
585
  if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
633
586
  logWarning2(
634
587
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -641,7 +594,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
641
594
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
642
595
  continue;
643
596
  }
644
- const memberName = asString2(rawMember.name);
597
+ const memberName = asString(rawMember.name);
645
598
  const memberType = rawMember.type;
646
599
  if (!memberName || !isEvaluatorKind(memberType)) {
647
600
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -679,7 +632,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
679
632
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
680
633
  };
681
634
  } else if (aggregatorType === "code_judge") {
682
- const aggregatorPath = asString2(rawAggregator.path);
635
+ const aggregatorPath = asString(rawAggregator.path);
683
636
  if (!aggregatorPath) {
684
637
  logWarning2(
685
638
  `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -692,7 +645,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
692
645
  cwd: searchRoots[0]
693
646
  };
694
647
  } else {
695
- const aggregatorPrompt = asString2(rawAggregator.prompt);
648
+ const aggregatorPrompt = asString(rawAggregator.prompt);
696
649
  let promptPath2;
697
650
  if (aggregatorPrompt) {
698
651
  const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
@@ -717,7 +670,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
717
670
  continue;
718
671
  }
719
672
  if (typeValue === "tool_trajectory") {
720
- const mode = asString2(rawEvaluator.mode);
673
+ const mode = asString(rawEvaluator.mode);
721
674
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
722
675
  logWarning2(
723
676
  `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -808,8 +761,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
808
761
  );
809
762
  continue;
810
763
  }
811
- const fieldPath = asString2(rawField.path);
812
- const match = asString2(rawField.match);
764
+ const fieldPath = asString(rawField.path);
765
+ const match = asString(rawField.match);
813
766
  if (!fieldPath) {
814
767
  logWarning2(
815
768
  `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -839,7 +792,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
839
792
  );
840
793
  continue;
841
794
  }
842
- const aggregation = asString2(rawEvaluator.aggregation);
795
+ const aggregation = asString(rawEvaluator.aggregation);
843
796
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
844
797
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
845
798
  evaluators.push({
@@ -920,7 +873,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
920
873
  });
921
874
  continue;
922
875
  }
923
- const prompt = asString2(rawEvaluator.prompt);
876
+ const prompt = asString(rawEvaluator.prompt);
924
877
  let promptPath;
925
878
  if (prompt) {
926
879
  const resolved = await resolveFileReference(prompt, searchRoots);
@@ -939,11 +892,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
939
892
  );
940
893
  }
941
894
  }
942
- const _model = asString2(rawEvaluator.model);
895
+ const _model = asString(rawEvaluator.model);
943
896
  const rawRubrics = rawEvaluator.rubrics;
944
897
  const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
945
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
946
- description: asString2(rubric.description) ?? "",
898
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
899
+ description: asString(rubric.description) ?? "",
947
900
  weight: typeof rubric.weight === "number" ? rubric.weight : 1,
948
901
  required: typeof rubric.required === "boolean" ? rubric.required : true
949
902
  })).filter((r) => r.description.length > 0) : void 0;
@@ -987,7 +940,7 @@ function coerceEvaluator(candidate, contextId) {
987
940
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
988
941
  return void 0;
989
942
  }
990
- function asString2(value) {
943
+ function asString(value) {
991
944
  return typeof value === "string" ? value : void 0;
992
945
  }
993
946
  function asStringArray(value, description) {
@@ -1063,6 +1016,68 @@ function isValidFieldAggregationType(value) {
1063
1016
  // src/evaluation/loaders/message-processor.ts
1064
1017
  var import_promises4 = require("fs/promises");
1065
1018
  var import_node_path4 = __toESM(require("path"), 1);
1019
+
1020
+ // src/evaluation/formatting/segment-formatter.ts
1021
+ function formatFileContents(parts) {
1022
+ const fileCount = parts.filter((p) => p.isFile).length;
1023
+ if (fileCount > 0) {
1024
+ return parts.map((part) => {
1025
+ if (part.isFile && part.displayPath) {
1026
+ return `<file path="${part.displayPath}">
1027
+ ${part.content}
1028
+ </file>`;
1029
+ }
1030
+ return part.content;
1031
+ }).join("\n\n");
1032
+ }
1033
+ return parts.map((p) => p.content).join(" ");
1034
+ }
1035
+ function formatSegment(segment, mode = "lm") {
1036
+ const type = asString2(segment.type);
1037
+ if (type === "text") {
1038
+ return asString2(segment.value);
1039
+ }
1040
+ if (type === "guideline_ref") {
1041
+ const refPath = asString2(segment.path);
1042
+ return refPath ? `<Attached: ${refPath}>` : void 0;
1043
+ }
1044
+ if (type === "file") {
1045
+ const filePath = asString2(segment.path);
1046
+ if (!filePath) {
1047
+ return void 0;
1048
+ }
1049
+ if (mode === "agent") {
1050
+ return `<file: path="${filePath}">`;
1051
+ }
1052
+ const text = asString2(segment.text);
1053
+ if (text && filePath) {
1054
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
1055
+ }
1056
+ }
1057
+ return void 0;
1058
+ }
1059
+ function hasVisibleContent(segments) {
1060
+ return segments.some((segment) => {
1061
+ const type = asString2(segment.type);
1062
+ if (type === "text") {
1063
+ const value = asString2(segment.value);
1064
+ return value !== void 0 && value.trim().length > 0;
1065
+ }
1066
+ if (type === "guideline_ref") {
1067
+ return false;
1068
+ }
1069
+ if (type === "file") {
1070
+ const text = asString2(segment.text);
1071
+ return text !== void 0 && text.trim().length > 0;
1072
+ }
1073
+ return false;
1074
+ });
1075
+ }
1076
+ function asString2(value) {
1077
+ return typeof value === "string" ? value : void 0;
1078
+ }
1079
+
1080
+ // src/evaluation/loaders/message-processor.ts
1066
1081
  var ANSI_YELLOW4 = "\x1B[33m";
1067
1082
  var ANSI_RESET4 = "\x1B[0m";
1068
1083
  async function processMessages(options) {
@@ -1368,9 +1383,6 @@ ${messageContent}`);
1368
1383
  questionParts.push(formattedContent);
1369
1384
  }
1370
1385
  }
1371
- if (testCase.code_snippets.length > 0) {
1372
- questionParts.push(testCase.code_snippets.join("\n"));
1373
- }
1374
1386
  question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
1375
1387
  }
1376
1388
  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1569,7 +1581,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1569
1581
  repoRootPath,
1570
1582
  verbose
1571
1583
  }) : [];
1572
- const codeSnippets = extractCodeBlocks(inputSegments);
1573
1584
  let referenceAnswer = "";
1574
1585
  if (outputSegments.length > 0) {
1575
1586
  const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1642,7 +1653,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1642
1653
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
1643
1654
  guideline_patterns: guidelinePatterns,
1644
1655
  file_paths: allFilePaths,
1645
- code_snippets: codeSnippets,
1646
1656
  expected_outcome: outcome,
1647
1657
  evaluator: evalCaseEvaluatorKind,
1648
1658
  evaluators
@@ -6327,9 +6337,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
6327
6337
  return createProvider(resolved);
6328
6338
  }
6329
6339
 
6330
- // src/evaluation/evaluators.ts
6331
- var import_ai2 = require("ai");
6332
- var import_zod3 = require("zod");
6340
+ // src/evaluation/evaluators/scoring.ts
6341
+ function scoreToVerdict(score) {
6342
+ if (score >= 0.8) {
6343
+ return "pass";
6344
+ }
6345
+ if (score >= 0.6) {
6346
+ return "borderline";
6347
+ }
6348
+ return "fail";
6349
+ }
6350
+ function clampScore(value) {
6351
+ if (Number.isNaN(value) || !Number.isFinite(value)) {
6352
+ return 0;
6353
+ }
6354
+ if (value < 0) {
6355
+ return 0;
6356
+ }
6357
+ if (value > 1) {
6358
+ return 1;
6359
+ }
6360
+ return value;
6361
+ }
6362
+ function extractJsonBlob(text) {
6363
+ const match = text.match(/\{[\s\S]*\}/);
6364
+ return match?.[0];
6365
+ }
6366
+ function parseJsonFromText(text) {
6367
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6368
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
6369
+ return JSON.parse(blob);
6370
+ }
6371
+ function isNonEmptyString(value) {
6372
+ return typeof value === "string" && value.trim().length > 0;
6373
+ }
6374
+ function parseJsonSafe(payload) {
6375
+ try {
6376
+ return JSON.parse(payload);
6377
+ } catch {
6378
+ return void 0;
6379
+ }
6380
+ }
6381
+ function deepEqual(a, b) {
6382
+ if (a === b) return true;
6383
+ if (a === null || b === null) return a === b;
6384
+ if (typeof a !== typeof b) return false;
6385
+ if (typeof a !== "object") return a === b;
6386
+ if (Array.isArray(a) !== Array.isArray(b)) return false;
6387
+ if (Array.isArray(a) && Array.isArray(b)) {
6388
+ if (a.length !== b.length) return false;
6389
+ return a.every((val, i) => deepEqual(val, b[i]));
6390
+ }
6391
+ const aObj = a;
6392
+ const bObj = b;
6393
+ const aKeys = Object.keys(aObj);
6394
+ const bKeys = Object.keys(bObj);
6395
+ if (aKeys.length !== bKeys.length) return false;
6396
+ return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
6397
+ }
6333
6398
 
6334
6399
  // src/runtime/exec.ts
6335
6400
  function shellEscapePath(value) {
@@ -6354,7 +6419,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
6354
6419
  cwd: options.cwd,
6355
6420
  stdin: encoder.encode(stdinPayload),
6356
6421
  stdout: "pipe",
6357
- stderr: "pipe"
6422
+ stderr: "pipe",
6423
+ // Merge additional env vars with process.env
6424
+ env: options.env ? { ...process.env, ...options.env } : process.env
6358
6425
  });
6359
6426
  let timedOut = false;
6360
6427
  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -6389,7 +6456,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
6389
6456
  const [cmd, ...args] = argv;
6390
6457
  const child = spawn4(cmd, args, {
6391
6458
  cwd: options.cwd,
6392
- stdio: ["pipe", "pipe", "pipe"]
6459
+ stdio: ["pipe", "pipe", "pipe"],
6460
+ // Merge additional env vars with process.env
6461
+ env: options.env ? { ...process.env, ...options.env } : process.env
6393
6462
  });
6394
6463
  const stdoutChunks = [];
6395
6464
  const stderrChunks = [];
@@ -6442,7 +6511,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
6442
6511
  const child = spawn4(wrappedCommand, {
6443
6512
  shell: true,
6444
6513
  cwd: options.cwd,
6445
- stdio: ["ignore", "ignore", "ignore"]
6514
+ stdio: ["ignore", "ignore", "ignore"],
6515
+ // Merge additional env vars with process.env
6516
+ env: options.env ? { ...process.env, ...options.env } : process.env
6446
6517
  });
6447
6518
  const timeout = options.timeoutMs ? setTimeout(() => {
6448
6519
  child.kill();
@@ -6469,59 +6540,414 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
6469
6540
  }
6470
6541
  }
6471
6542
 
6472
- // src/evaluation/case-conversion.ts
6473
- function toSnakeCase(str) {
6474
- if (/^[A-Z]/.test(str)) {
6475
- return str;
6476
- }
6477
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
6478
- }
6479
- function toSnakeCaseDeep(obj) {
6480
- if (obj === null || obj === void 0) {
6481
- return obj;
6482
- }
6483
- if (Array.isArray(obj)) {
6484
- return obj.map((item) => toSnakeCaseDeep(item));
6485
- }
6486
- if (typeof obj === "object") {
6487
- const result = {};
6488
- for (const [key, value] of Object.entries(obj)) {
6489
- const snakeKey = toSnakeCase(key);
6490
- result[snakeKey] = toSnakeCaseDeep(value);
6543
+ // src/runtime/target-proxy.ts
6544
+ var import_node_crypto4 = require("crypto");
6545
+ var import_node_http = require("http");
6546
+ var DEFAULT_MAX_CALLS = 50;
6547
+ async function createTargetProxy(options) {
6548
+ const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
6549
+ const token = (0, import_node_crypto4.randomBytes)(32).toString("hex");
6550
+ let callCount = 0;
6551
+ let isShutdown = false;
6552
+ const targetsList = availableTargets ?? [defaultProvider.targetName];
6553
+ function resolveProvider(targetName) {
6554
+ if (targetName === void 0 || targetName === defaultProvider.targetName) {
6555
+ return defaultProvider;
6556
+ }
6557
+ if (targetResolver) {
6558
+ return targetResolver(targetName);
6491
6559
  }
6492
- return result;
6560
+ return void 0;
6493
6561
  }
6494
- return obj;
6495
- }
6496
-
6497
- // src/evaluation/providers/types.ts
6498
- var AGENT_PROVIDER_KINDS = [
6499
- "codex",
6500
- "pi-coding-agent",
6501
- "claude-code",
6502
- "vscode",
6503
- "vscode-insiders"
6504
- ];
6505
- function extractLastAssistantContent(messages) {
6506
- if (!messages || messages.length === 0) {
6507
- return "";
6562
+ const server = (0, import_node_http.createServer)(async (req, res) => {
6563
+ res.setHeader("Access-Control-Allow-Origin", "*");
6564
+ res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
6565
+ res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
6566
+ if (req.method === "OPTIONS") {
6567
+ res.writeHead(204);
6568
+ res.end();
6569
+ return;
6570
+ }
6571
+ const authHeader = req.headers.authorization;
6572
+ if (!authHeader || authHeader !== `Bearer ${token}`) {
6573
+ sendJson(res, 401, { error: "Unauthorized" });
6574
+ return;
6575
+ }
6576
+ if (isShutdown) {
6577
+ sendJson(res, 503, { error: "Proxy is shutting down" });
6578
+ return;
6579
+ }
6580
+ const url2 = req.url ?? "";
6581
+ if (req.method === "GET" && url2 === "/info") {
6582
+ handleInfo(res);
6583
+ return;
6584
+ }
6585
+ if (req.method === "POST" && url2 === "/invoke") {
6586
+ await handleInvoke(req, res);
6587
+ return;
6588
+ }
6589
+ if (req.method === "POST" && url2 === "/invokeBatch") {
6590
+ await handleInvokeBatch(req, res);
6591
+ return;
6592
+ }
6593
+ sendJson(res, 404, { error: "Not found" });
6594
+ });
6595
+ function handleInfo(res) {
6596
+ const response = {
6597
+ targetName: defaultProvider.targetName,
6598
+ maxCalls,
6599
+ callCount,
6600
+ availableTargets: targetsList
6601
+ };
6602
+ sendJson(res, 200, response);
6508
6603
  }
6509
- for (let i = messages.length - 1; i >= 0; i--) {
6510
- const msg = messages[i];
6511
- if (msg.role === "assistant" && msg.content !== void 0) {
6512
- if (typeof msg.content === "string") {
6513
- return msg.content;
6604
+ async function handleInvoke(req, res) {
6605
+ if (callCount >= maxCalls) {
6606
+ sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
6607
+ return;
6608
+ }
6609
+ try {
6610
+ const body = await readBody(req);
6611
+ const request = JSON.parse(body);
6612
+ if (!request.question || typeof request.question !== "string") {
6613
+ sendJson(res, 400, { error: "Missing required field: question" });
6614
+ return;
6514
6615
  }
6515
- return JSON.stringify(msg.content);
6616
+ const provider = resolveProvider(request.target);
6617
+ if (!provider) {
6618
+ sendJson(res, 400, {
6619
+ error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
6620
+ });
6621
+ return;
6622
+ }
6623
+ callCount++;
6624
+ const response = await provider.invoke({
6625
+ question: request.question,
6626
+ systemPrompt: request.systemPrompt,
6627
+ evalCaseId: request.evalCaseId ?? "proxy",
6628
+ attempt: request.attempt ?? 1
6629
+ });
6630
+ const outputMessages = response.outputMessages ?? [];
6631
+ const rawText = extractLastAssistantContent(outputMessages);
6632
+ const result = {
6633
+ outputMessages,
6634
+ rawText
6635
+ };
6636
+ sendJson(res, 200, result);
6637
+ } catch (error) {
6638
+ const message = error instanceof Error ? error.message : String(error);
6639
+ sendJson(res, 500, { error: message });
6516
6640
  }
6517
6641
  }
6518
- return "";
6519
- }
6520
- function isAgentProvider(provider) {
6521
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
6642
+ async function handleInvokeBatch(req, res) {
6643
+ try {
6644
+ const body = await readBody(req);
6645
+ const { requests } = JSON.parse(body);
6646
+ if (!Array.isArray(requests)) {
6647
+ sendJson(res, 400, { error: "Missing required field: requests (array)" });
6648
+ return;
6649
+ }
6650
+ if (callCount + requests.length > maxCalls) {
6651
+ sendJson(res, 429, {
6652
+ error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
6653
+ });
6654
+ return;
6655
+ }
6656
+ const responses = [];
6657
+ for (const request of requests) {
6658
+ if (!request.question || typeof request.question !== "string") {
6659
+ responses.push({
6660
+ outputMessages: [],
6661
+ rawText: "Error: Missing required field: question"
6662
+ });
6663
+ continue;
6664
+ }
6665
+ const provider = resolveProvider(request.target);
6666
+ if (!provider) {
6667
+ responses.push({
6668
+ outputMessages: [],
6669
+ rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
6670
+ });
6671
+ continue;
6672
+ }
6673
+ callCount++;
6674
+ try {
6675
+ const response = await provider.invoke({
6676
+ question: request.question,
6677
+ systemPrompt: request.systemPrompt,
6678
+ evalCaseId: request.evalCaseId ?? "proxy",
6679
+ attempt: request.attempt ?? 1
6680
+ });
6681
+ const outputMessages = response.outputMessages ?? [];
6682
+ responses.push({
6683
+ outputMessages,
6684
+ rawText: extractLastAssistantContent(outputMessages)
6685
+ });
6686
+ } catch (error) {
6687
+ const message = error instanceof Error ? error.message : String(error);
6688
+ responses.push({
6689
+ outputMessages: [],
6690
+ rawText: `Error: ${message}`
6691
+ });
6692
+ }
6693
+ }
6694
+ sendJson(res, 200, { responses });
6695
+ } catch (error) {
6696
+ const message = error instanceof Error ? error.message : String(error);
6697
+ sendJson(res, 500, { error: message });
6698
+ }
6699
+ }
6700
+ await new Promise((resolve, reject) => {
6701
+ server.once("error", reject);
6702
+ server.listen(0, "127.0.0.1", () => {
6703
+ server.removeListener("error", reject);
6704
+ resolve();
6705
+ });
6706
+ });
6707
+ const address = server.address();
6708
+ const url = `http://127.0.0.1:${address.port}`;
6709
+ return {
6710
+ url,
6711
+ token,
6712
+ shutdown: async () => {
6713
+ isShutdown = true;
6714
+ return new Promise((resolve, reject) => {
6715
+ server.close((err) => {
6716
+ if (err) reject(err);
6717
+ else resolve();
6718
+ });
6719
+ });
6720
+ },
6721
+ getUsageMetadata: () => ({
6722
+ callCount,
6723
+ maxCalls
6724
+ })
6725
+ };
6726
+ }
6727
+ function sendJson(res, statusCode, body) {
6728
+ res.writeHead(statusCode, { "Content-Type": "application/json" });
6729
+ res.end(JSON.stringify(body));
6730
+ }
6731
+ function readBody(req) {
6732
+ return new Promise((resolve, reject) => {
6733
+ const chunks = [];
6734
+ req.on("data", (chunk) => chunks.push(chunk));
6735
+ req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
6736
+ req.on("error", reject);
6737
+ });
6738
+ }
6739
+ function extractLastAssistantContent(messages) {
6740
+ for (let i = messages.length - 1; i >= 0; i--) {
6741
+ const msg = messages[i];
6742
+ if (msg.role === "assistant" && msg.content !== void 0) {
6743
+ if (typeof msg.content === "string") {
6744
+ return msg.content;
6745
+ }
6746
+ if (Array.isArray(msg.content)) {
6747
+ for (const part of msg.content) {
6748
+ if (typeof part === "object" && part !== null && "text" in part) {
6749
+ return String(part.text);
6750
+ }
6751
+ }
6752
+ }
6753
+ }
6754
+ }
6755
+ return void 0;
6756
+ }
6757
+
6758
+ // src/evaluation/case-conversion.ts
6759
+ function toSnakeCase(str) {
6760
+ if (/^[A-Z]/.test(str)) {
6761
+ return str;
6762
+ }
6763
+ return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
6764
+ }
6765
+ function toSnakeCaseDeep(obj) {
6766
+ if (obj === null || obj === void 0) {
6767
+ return obj;
6768
+ }
6769
+ if (Array.isArray(obj)) {
6770
+ return obj.map((item) => toSnakeCaseDeep(item));
6771
+ }
6772
+ if (typeof obj === "object") {
6773
+ const result = {};
6774
+ for (const [key, value] of Object.entries(obj)) {
6775
+ const snakeKey = toSnakeCase(key);
6776
+ result[snakeKey] = toSnakeCaseDeep(value);
6777
+ }
6778
+ return result;
6779
+ }
6780
+ return obj;
6781
+ }
6782
+
6783
+ // src/evaluation/evaluators/code-evaluator.ts
6784
+ var CodeEvaluator = class {
6785
+ kind = "code";
6786
+ script;
6787
+ cwd;
6788
+ agentTimeoutMs;
6789
+ config;
6790
+ target;
6791
+ constructor(options) {
6792
+ this.script = options.script;
6793
+ this.cwd = options.cwd;
6794
+ this.agentTimeoutMs = options.agentTimeoutMs;
6795
+ this.config = options.config;
6796
+ this.target = options.target;
6797
+ }
6798
+ async evaluate(context) {
6799
+ const payload = {
6800
+ question: context.evalCase.question,
6801
+ expectedOutcome: context.evalCase.expected_outcome,
6802
+ expectedMessages: context.evalCase.expected_messages,
6803
+ referenceAnswer: context.evalCase.reference_answer,
6804
+ candidateAnswer: context.candidate,
6805
+ outputMessages: context.outputMessages ?? null,
6806
+ guidelineFiles: context.evalCase.guideline_paths,
6807
+ inputFiles: context.evalCase.file_paths.filter(
6808
+ (path17) => !context.evalCase.guideline_paths.includes(path17)
6809
+ ),
6810
+ inputMessages: context.evalCase.input_messages,
6811
+ traceSummary: context.traceSummary ?? null,
6812
+ config: this.config ?? null
6813
+ };
6814
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
6815
+ let proxyEnv;
6816
+ let proxyShutdown;
6817
+ let getProxyUsage;
6818
+ if (this.target !== void 0 && context.judgeProvider) {
6819
+ const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
6820
+ const proxy = await createTargetProxy({
6821
+ defaultProvider: context.judgeProvider,
6822
+ targetResolver: context.targetResolver,
6823
+ availableTargets: context.availableTargets,
6824
+ maxCalls
6825
+ });
6826
+ proxyEnv = {
6827
+ AGENTV_TARGET_PROXY_URL: proxy.url,
6828
+ AGENTV_TARGET_PROXY_TOKEN: proxy.token
6829
+ };
6830
+ proxyShutdown = proxy.shutdown;
6831
+ getProxyUsage = proxy.getUsageMetadata;
6832
+ }
6833
+ try {
6834
+ const stdout = await executeScript(
6835
+ this.script,
6836
+ inputPayload,
6837
+ this.agentTimeoutMs,
6838
+ this.cwd,
6839
+ proxyEnv
6840
+ );
6841
+ const parsed = parseJsonSafe(stdout);
6842
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6843
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6844
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6845
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6846
+ const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
6847
+ const proxyUsage = getProxyUsage?.();
6848
+ const evaluatorRawRequest = {
6849
+ script: this.script,
6850
+ ...this.cwd ? { cwd: this.cwd } : {},
6851
+ ...proxyUsage ? {
6852
+ target_proxy: {
6853
+ call_count: proxyUsage.callCount,
6854
+ max_calls: proxyUsage.maxCalls
6855
+ }
6856
+ } : {}
6857
+ };
6858
+ return {
6859
+ score,
6860
+ verdict: scoreToVerdict(score),
6861
+ hits,
6862
+ misses,
6863
+ expectedAspectCount: hits.length + misses.length || 1,
6864
+ reasoning,
6865
+ evaluatorRawRequest,
6866
+ ...details ? { details } : {}
6867
+ };
6868
+ } catch (error) {
6869
+ const message = error instanceof Error ? error.message : String(error);
6870
+ const proxyUsage = getProxyUsage?.();
6871
+ return {
6872
+ score: 0,
6873
+ verdict: "fail",
6874
+ hits: [],
6875
+ misses: [`Code evaluator failed: ${message}`],
6876
+ expectedAspectCount: 1,
6877
+ reasoning: message,
6878
+ evaluatorRawRequest: {
6879
+ script: this.script,
6880
+ ...this.cwd ? { cwd: this.cwd } : {},
6881
+ ...proxyUsage ? {
6882
+ target_proxy: {
6883
+ call_count: proxyUsage.callCount,
6884
+ max_calls: proxyUsage.maxCalls
6885
+ }
6886
+ } : {},
6887
+ error: message
6888
+ }
6889
+ };
6890
+ } finally {
6891
+ if (proxyShutdown) {
6892
+ await proxyShutdown();
6893
+ }
6894
+ }
6895
+ }
6896
+ };
6897
+ async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
6898
+ const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
6899
+ if (exitCode !== 0) {
6900
+ const trimmedErr = formatStderr(stderr);
6901
+ throw new Error(
6902
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
6903
+ );
6904
+ }
6905
+ return stdout.trim();
6906
+ }
6907
+ function formatStderr(stderr) {
6908
+ const trimmed = stderr.trim();
6909
+ const maxLength = 2e3;
6910
+ if (trimmed.length <= maxLength) {
6911
+ return trimmed;
6912
+ }
6913
+ const tail = trimmed.slice(-maxLength);
6914
+ return `...(truncated, last ${maxLength} chars)
6915
+ ${tail}`;
6916
+ }
6917
+
6918
+ // src/evaluation/evaluators/composite.ts
6919
+ var import_ai3 = require("ai");
6920
+
6921
+ // src/evaluation/providers/types.ts
6922
+ var AGENT_PROVIDER_KINDS = [
6923
+ "codex",
6924
+ "pi-coding-agent",
6925
+ "claude-code",
6926
+ "vscode",
6927
+ "vscode-insiders"
6928
+ ];
6929
+ function extractLastAssistantContent2(messages) {
6930
+ if (!messages || messages.length === 0) {
6931
+ return "";
6932
+ }
6933
+ for (let i = messages.length - 1; i >= 0; i--) {
6934
+ const msg = messages[i];
6935
+ if (msg.role === "assistant" && msg.content !== void 0) {
6936
+ if (typeof msg.content === "string") {
6937
+ return msg.content;
6938
+ }
6939
+ return JSON.stringify(msg.content);
6940
+ }
6941
+ }
6942
+ return "";
6943
+ }
6944
+ function isAgentProvider(provider) {
6945
+ return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
6522
6946
  }
6523
6947
 
6524
- // src/evaluation/evaluators.ts
6948
+ // src/evaluation/evaluators/llm-judge.ts
6949
+ var import_ai2 = require("ai");
6950
+ var import_zod3 = require("zod");
6525
6951
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
6526
6952
 
6527
6953
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -6601,7 +7027,7 @@ var LlmJudgeEvaluator = class {
6601
7027
  target: judgeProvider.targetName
6602
7028
  };
6603
7029
  try {
6604
- const { data, providerResponse } = await this.runWithRetry({
7030
+ const { data } = await this.runWithRetry({
6605
7031
  context,
6606
7032
  judgeProvider,
6607
7033
  systemPrompt,
@@ -6714,7 +7140,7 @@ var LlmJudgeEvaluator = class {
6714
7140
  temperature: this.temperature
6715
7141
  });
6716
7142
  const data = schema.parse(
6717
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
7143
+ parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
6718
7144
  );
6719
7145
  return { data, providerResponse: response };
6720
7146
  } catch (e) {
@@ -6750,86 +7176,160 @@ You must return a valid JSON object matching this schema:
6750
7176
  "overall_reasoning": "string (summary)"
6751
7177
  }`;
6752
7178
  }
6753
- function scoreToVerdict(score) {
6754
- if (score >= 0.8) {
6755
- return "pass";
6756
- }
6757
- if (score >= 0.6) {
6758
- return "borderline";
6759
- }
6760
- return "fail";
7179
+ function substituteVariables(template, variables) {
7180
+ return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
7181
+ return variables[varName] ?? match;
7182
+ });
6761
7183
  }
6762
- function clampScore(value) {
6763
- if (Number.isNaN(value) || !Number.isFinite(value)) {
6764
- return 0;
6765
- }
6766
- if (value < 0) {
6767
- return 0;
6768
- }
6769
- if (value > 1) {
6770
- return 1;
7184
+ function calculateRubricScore(result, rubrics) {
7185
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
7186
+ const hits = [];
7187
+ const misses = [];
7188
+ let totalWeight = 0;
7189
+ let earnedWeight = 0;
7190
+ let failedRequired = false;
7191
+ for (const check of result.checks) {
7192
+ const rubric = rubricMap.get(check.id);
7193
+ if (!rubric) {
7194
+ continue;
7195
+ }
7196
+ totalWeight += rubric.weight;
7197
+ if (check.satisfied) {
7198
+ earnedWeight += rubric.weight;
7199
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
7200
+ } else {
7201
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
7202
+ if (rubric.required) {
7203
+ failedRequired = true;
7204
+ }
7205
+ }
6771
7206
  }
6772
- return value;
6773
- }
6774
- function extractJsonBlob(text) {
6775
- const match = text.match(/\{[\s\S]*\}/);
6776
- return match?.[0];
6777
- }
6778
- function parseJsonFromText(text) {
6779
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6780
- const blob = extractJsonBlob(cleaned) ?? cleaned;
6781
- return JSON.parse(blob);
6782
- }
6783
- function isNonEmptyString(value) {
6784
- return typeof value === "string" && value.trim().length > 0;
7207
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
7208
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
7209
+ return { score, verdict, hits, misses };
6785
7210
  }
6786
- var CodeEvaluator = class {
6787
- kind = "code";
6788
- script;
6789
- cwd;
6790
- agentTimeoutMs;
7211
+
7212
+ // src/evaluation/evaluators/composite.ts
7213
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
7214
+ {{EVALUATOR_RESULTS_JSON}}
7215
+
7216
+ Decide the final score and verdict based on all evaluator results.
7217
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
7218
+ var CompositeEvaluator = class {
7219
+ kind = "composite";
6791
7220
  config;
7221
+ evaluatorFactory;
7222
+ cwd;
6792
7223
  constructor(options) {
6793
- this.script = options.script;
6794
- this.cwd = options.cwd;
6795
- this.agentTimeoutMs = options.agentTimeoutMs;
6796
7224
  this.config = options.config;
7225
+ this.evaluatorFactory = options.evaluatorFactory;
7226
+ this.cwd = options.cwd;
6797
7227
  }
6798
7228
  async evaluate(context) {
6799
- const payload = {
6800
- question: context.evalCase.question,
6801
- expectedOutcome: context.evalCase.expected_outcome,
6802
- expectedMessages: context.evalCase.expected_messages,
6803
- referenceAnswer: context.evalCase.reference_answer,
6804
- candidateAnswer: context.candidate,
6805
- outputMessages: context.outputMessages ?? null,
6806
- guidelineFiles: context.evalCase.guideline_paths,
6807
- inputFiles: context.evalCase.file_paths.filter(
6808
- (path17) => !context.evalCase.guideline_paths.includes(path17)
6809
- ),
6810
- inputMessages: context.evalCase.input_messages,
6811
- traceSummary: context.traceSummary ?? null,
6812
- config: this.config ?? null
7229
+ const memberResults = await Promise.all(
7230
+ this.config.evaluators.map(async (memberConfig) => {
7231
+ const evaluator = this.evaluatorFactory.create(memberConfig, context);
7232
+ return {
7233
+ id: memberConfig.name,
7234
+ type: memberConfig.type,
7235
+ result: await evaluator.evaluate(context)
7236
+ };
7237
+ })
7238
+ );
7239
+ return this.aggregate(memberResults, context);
7240
+ }
7241
+ async aggregate(results, context) {
7242
+ const aggregator = this.config.aggregator;
7243
+ switch (aggregator.type) {
7244
+ case "code_judge":
7245
+ return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
7246
+ case "llm_judge":
7247
+ return this.runLlmAggregator(results, context, aggregator);
7248
+ default:
7249
+ return this.runWeightedAverage(results, aggregator.weights);
7250
+ }
7251
+ }
7252
+ runWeightedAverage(results, weights) {
7253
+ let totalWeight = 0;
7254
+ let weightedSum = 0;
7255
+ const allHits = [];
7256
+ const allMisses = [];
7257
+ const reasoningParts = [];
7258
+ const evaluatorResults = [];
7259
+ for (const member of results) {
7260
+ const weight = weights?.[member.id] ?? 1;
7261
+ totalWeight += weight;
7262
+ weightedSum += member.result.score * weight;
7263
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
7264
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
7265
+ if (member.result.reasoning) {
7266
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
7267
+ }
7268
+ evaluatorResults.push({
7269
+ name: member.id,
7270
+ type: member.type,
7271
+ score: member.result.score,
7272
+ weight,
7273
+ verdict: member.result.verdict,
7274
+ hits: [...member.result.hits],
7275
+ misses: [...member.result.misses],
7276
+ reasoning: member.result.reasoning,
7277
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7278
+ evaluatorResults: member.result.evaluatorResults,
7279
+ details: member.result.details
7280
+ });
7281
+ }
7282
+ const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
7283
+ return {
7284
+ score: clampScore(finalScore),
7285
+ verdict: scoreToVerdict(finalScore),
7286
+ hits: allHits,
7287
+ misses: allMisses,
7288
+ expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
7289
+ reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
7290
+ evaluatorRawRequest: {
7291
+ aggregator: "weighted_average",
7292
+ ...weights ? { weights } : {}
7293
+ },
7294
+ evaluatorResults
6813
7295
  };
6814
- const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
7296
+ }
7297
+ async runCodeAggregator(results, scriptPath, cwd, weights) {
7298
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7299
+ const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
7300
+ const evaluatorResults = results.map((member) => ({
7301
+ name: member.id,
7302
+ type: member.type,
7303
+ score: member.result.score,
7304
+ weight: weights?.[member.id] ?? 1,
7305
+ verdict: member.result.verdict,
7306
+ hits: [...member.result.hits],
7307
+ misses: [...member.result.misses],
7308
+ reasoning: member.result.reasoning,
7309
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7310
+ evaluatorResults: member.result.evaluatorResults,
7311
+ details: member.result.details
7312
+ }));
6815
7313
  try {
6816
- const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
7314
+ const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6817
7315
  const parsed = parseJsonSafe(stdout);
6818
7316
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6819
7317
  const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6820
7318
  const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6821
7319
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
7320
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
6822
7321
  return {
6823
7322
  score,
6824
- verdict: scoreToVerdict(score),
7323
+ verdict,
6825
7324
  hits,
6826
7325
  misses,
6827
7326
  expectedAspectCount: hits.length + misses.length || 1,
6828
7327
  reasoning,
6829
7328
  evaluatorRawRequest: {
6830
- script: this.script,
6831
- ...this.cwd ? { cwd: this.cwd } : {}
6832
- }
7329
+ aggregator: "code_judge",
7330
+ script: scriptPath
7331
+ },
7332
+ evaluatorResults
6833
7333
  };
6834
7334
  } catch (error) {
6835
7335
  const message = error instanceof Error ? error.message : String(error);
@@ -6837,452 +7337,292 @@ var CodeEvaluator = class {
6837
7337
  score: 0,
6838
7338
  verdict: "fail",
6839
7339
  hits: [],
6840
- misses: [`Code evaluator failed: ${message}`],
7340
+ misses: [`Code aggregator failed: ${message}`],
6841
7341
  expectedAspectCount: 1,
6842
7342
  reasoning: message,
6843
7343
  evaluatorRawRequest: {
6844
- script: this.script,
6845
- ...this.cwd ? { cwd: this.cwd } : {},
7344
+ aggregator: "code_judge",
7345
+ script: scriptPath,
6846
7346
  error: message
6847
- }
7347
+ },
7348
+ evaluatorResults
6848
7349
  };
6849
7350
  }
6850
7351
  }
6851
- };
6852
- function calculateRubricScore(result, rubrics) {
6853
- const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
6854
- const hits = [];
6855
- const misses = [];
6856
- let totalWeight = 0;
6857
- let earnedWeight = 0;
6858
- let failedRequired = false;
6859
- for (const check of result.checks) {
6860
- const rubric = rubricMap.get(check.id);
6861
- if (!rubric) {
6862
- continue;
7352
+ async runLlmAggregator(results, context, config) {
7353
+ const judgeProvider = context.judgeProvider;
7354
+ if (!judgeProvider) {
7355
+ throw new Error("No judge provider available for LLM aggregation");
6863
7356
  }
6864
- totalWeight += rubric.weight;
6865
- if (check.satisfied) {
6866
- earnedWeight += rubric.weight;
6867
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6868
- } else {
6869
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6870
- if (rubric.required) {
6871
- failedRequired = true;
7357
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7358
+ const resultsJson = JSON.stringify(resultsObject, null, 2);
7359
+ const evaluatorResults = results.map((member) => ({
7360
+ name: member.id,
7361
+ type: member.type,
7362
+ score: member.result.score,
7363
+ verdict: member.result.verdict,
7364
+ hits: [...member.result.hits],
7365
+ misses: [...member.result.misses],
7366
+ reasoning: member.result.reasoning,
7367
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7368
+ evaluatorResults: member.result.evaluatorResults,
7369
+ details: member.result.details
7370
+ }));
7371
+ const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
7372
+ const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
7373
+ const systemPrompt = buildOutputSchema();
7374
+ const evaluatorRawRequest = {
7375
+ aggregator: "llm_judge",
7376
+ userPrompt,
7377
+ systemPrompt,
7378
+ target: judgeProvider.targetName
7379
+ };
7380
+ try {
7381
+ const model = judgeProvider.asLanguageModel?.();
7382
+ if (model) {
7383
+ const { text } = await (0, import_ai3.generateText)({
7384
+ model,
7385
+ system: systemPrompt,
7386
+ prompt: userPrompt
7387
+ });
7388
+ const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
7389
+ const score2 = clampScore(data2.score);
7390
+ const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
7391
+ const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
7392
+ const reasoning2 = data2.reasoning;
7393
+ return {
7394
+ score: score2,
7395
+ verdict: scoreToVerdict(score2),
7396
+ hits: hits2,
7397
+ misses: misses2,
7398
+ expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
7399
+ reasoning: reasoning2,
7400
+ evaluatorRawRequest,
7401
+ evaluatorResults
7402
+ };
6872
7403
  }
7404
+ const response = await judgeProvider.invoke({
7405
+ question: userPrompt,
7406
+ systemPrompt,
7407
+ evalCaseId: context.evalCase.id,
7408
+ attempt: context.attempt
7409
+ });
7410
+ const data = freeformEvaluationSchema.parse(
7411
+ parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
7412
+ );
7413
+ const score = clampScore(data.score);
7414
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
7415
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
7416
+ const reasoning = data.reasoning;
7417
+ return {
7418
+ score,
7419
+ verdict: scoreToVerdict(score),
7420
+ hits,
7421
+ misses,
7422
+ expectedAspectCount: Math.max(hits.length + misses.length, 1),
7423
+ reasoning,
7424
+ evaluatorRawRequest,
7425
+ evaluatorResults
7426
+ };
7427
+ } catch {
7428
+ return {
7429
+ score: 0,
7430
+ verdict: "fail",
7431
+ hits: [],
7432
+ misses: [],
7433
+ expectedAspectCount: 1,
7434
+ evaluatorRawRequest,
7435
+ evaluatorResults
7436
+ };
6873
7437
  }
6874
7438
  }
6875
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
6876
- const verdict = failedRequired ? "fail" : scoreToVerdict(score);
6877
- return { score, verdict, hits, misses };
6878
- }
6879
- async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
6880
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
6881
- if (exitCode !== 0) {
6882
- const trimmedErr = formatStderr(stderr);
6883
- throw new Error(
6884
- trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
6885
- );
6886
- }
6887
- return stdout.trim();
6888
- }
6889
- function formatStderr(stderr) {
6890
- const trimmed = stderr.trim();
6891
- const maxLength = 2e3;
6892
- if (trimmed.length <= maxLength) {
6893
- return trimmed;
7439
+ };
7440
+
7441
+ // src/evaluation/evaluators/cost.ts
7442
+ var CostEvaluator = class {
7443
+ kind = "cost";
7444
+ config;
7445
+ constructor(options) {
7446
+ this.config = options.config;
6894
7447
  }
6895
- const tail = trimmed.slice(-maxLength);
6896
- return `...(truncated, last ${maxLength} chars)
6897
- ${tail}`;
6898
- }
6899
- function parseJsonSafe(payload) {
6900
- try {
6901
- return JSON.parse(payload);
6902
- } catch {
6903
- return void 0;
6904
- }
6905
- }
6906
- function substituteVariables(template, variables) {
6907
- return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
6908
- return variables[varName] ?? match;
6909
- });
6910
- }
6911
- function deepEqual(a, b) {
6912
- if (a === b) return true;
6913
- if (a === null || b === null) return a === b;
6914
- if (typeof a !== typeof b) return false;
6915
- if (typeof a !== "object") return a === b;
6916
- if (Array.isArray(a) !== Array.isArray(b)) return false;
6917
- if (Array.isArray(a) && Array.isArray(b)) {
6918
- if (a.length !== b.length) return false;
6919
- return a.every((val, i) => deepEqual(val, b[i]));
6920
- }
6921
- const aObj = a;
6922
- const bObj = b;
6923
- const aKeys = Object.keys(aObj);
6924
- const bKeys = Object.keys(bObj);
6925
- if (aKeys.length !== bKeys.length) return false;
6926
- return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
6927
- }
6928
- function argsMatch(expected, actual) {
6929
- if (expected === void 0) return true;
6930
- if (expected === "any") return true;
6931
- if (actual === void 0) return false;
6932
- for (const key of Object.keys(expected)) {
6933
- if (!Object.hasOwn(actual, key)) return false;
6934
- if (!deepEqual(expected[key], actual[key])) return false;
7448
+ evaluate(context) {
7449
+ const { budget } = this.config;
7450
+ const costUsd = context.traceSummary?.costUsd;
7451
+ if (costUsd === void 0) {
7452
+ return {
7453
+ score: 0,
7454
+ verdict: "fail",
7455
+ hits: [],
7456
+ misses: ["No cost data available in trace"],
7457
+ expectedAspectCount: 1,
7458
+ reasoning: "Execution cost not reported by provider",
7459
+ evaluatorRawRequest: {
7460
+ type: "cost",
7461
+ budget,
7462
+ costUsd: null
7463
+ }
7464
+ };
7465
+ }
7466
+ const passed = costUsd <= budget;
7467
+ const score = passed ? 1 : 0;
7468
+ const formatCost = (n) => `$${n.toFixed(4)}`;
7469
+ return {
7470
+ score,
7471
+ verdict: passed ? "pass" : "fail",
7472
+ hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
7473
+ misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
7474
+ expectedAspectCount: 1,
7475
+ reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
7476
+ evaluatorRawRequest: {
7477
+ type: "cost",
7478
+ budget,
7479
+ costUsd
7480
+ }
7481
+ };
6935
7482
  }
6936
- return true;
6937
- }
6938
- var ToolTrajectoryEvaluator = class {
6939
- kind = "tool_trajectory";
7483
+ };
7484
+
7485
+ // src/evaluation/evaluators/field-accuracy.ts
7486
+ var DEFAULT_DATE_FORMATS = [
7487
+ "YYYY-MM-DDTHH:mm:ssZ",
7488
+ // ISO with timezone
7489
+ "YYYY-MM-DDTHH:mm:ss",
7490
+ // ISO with time
7491
+ "YYYY-MM-DD",
7492
+ // ISO date
7493
+ "DD-MMM-YYYY",
7494
+ // Localized (e.g., "15-JAN-2025")
7495
+ "MM/DD/YYYY",
7496
+ // US format
7497
+ "DD/MM/YYYY",
7498
+ // EU format
7499
+ "MM-DD-YYYY",
7500
+ // US with dashes
7501
+ "DD-MM-YYYY"
7502
+ // EU with dashes
7503
+ ];
7504
+ var MONTH_NAMES = {
7505
+ jan: 0,
7506
+ january: 0,
7507
+ feb: 1,
7508
+ february: 1,
7509
+ mar: 2,
7510
+ march: 2,
7511
+ apr: 3,
7512
+ april: 3,
7513
+ may: 4,
7514
+ jun: 5,
7515
+ june: 5,
7516
+ jul: 6,
7517
+ july: 6,
7518
+ aug: 7,
7519
+ august: 7,
7520
+ sep: 8,
7521
+ sept: 8,
7522
+ september: 8,
7523
+ oct: 9,
7524
+ october: 9,
7525
+ nov: 10,
7526
+ november: 10,
7527
+ dec: 11,
7528
+ december: 11
7529
+ };
7530
+ var FieldAccuracyEvaluator = class {
7531
+ kind = "field_accuracy";
6940
7532
  config;
6941
7533
  constructor(options) {
6942
7534
  this.config = options.config;
6943
7535
  }
6944
7536
  evaluate(context) {
6945
- const { outputMessages, traceSummary } = context;
6946
- const toolCalls = this.extractToolCallsFromMessages(outputMessages);
6947
- if (toolCalls.length === 0 && !traceSummary) {
7537
+ const { evalCase, candidate } = context;
7538
+ let candidateData;
7539
+ try {
7540
+ candidateData = parseJsonFromTextSafe(candidate);
7541
+ } catch {
6948
7542
  return {
6949
7543
  score: 0,
6950
7544
  verdict: "fail",
6951
7545
  hits: [],
6952
- misses: ["No trace available for evaluation"],
6953
- expectedAspectCount: 1
7546
+ misses: ["Failed to parse candidate answer as JSON"],
7547
+ expectedAspectCount: this.config.fields.length,
7548
+ reasoning: "Candidate answer is not valid JSON"
6954
7549
  };
6955
7550
  }
6956
- const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
6957
- if (!summary) {
7551
+ const expectedData = this.extractExpectedData(evalCase.expected_messages);
7552
+ if (!expectedData) {
6958
7553
  return {
6959
7554
  score: 0,
6960
7555
  verdict: "fail",
6961
7556
  hits: [],
6962
- misses: ["No trace available for evaluation"],
6963
- expectedAspectCount: 1
7557
+ misses: ["No expected data found in expected_messages"],
7558
+ expectedAspectCount: this.config.fields.length,
7559
+ reasoning: "Could not extract expected data from expected_messages"
6964
7560
  };
6965
7561
  }
6966
- switch (this.config.mode) {
6967
- case "any_order":
6968
- return this.evaluateAnyOrder(summary);
6969
- case "in_order":
6970
- return this.evaluateInOrder(toolCalls);
6971
- case "exact":
6972
- return this.evaluateExact(toolCalls);
6973
- default:
6974
- return {
6975
- score: 0,
6976
- verdict: "fail",
6977
- hits: [],
6978
- misses: [`Unknown mode: ${this.config.mode}`],
6979
- expectedAspectCount: 1
6980
- };
7562
+ const fieldResults = [];
7563
+ for (const fieldConfig of this.config.fields) {
7564
+ const result = this.evaluateField(fieldConfig, candidateData, expectedData);
7565
+ fieldResults.push(result);
6981
7566
  }
7567
+ return this.aggregateResults(fieldResults);
6982
7568
  }
6983
7569
  /**
6984
- * Extract tool calls from output messages.
7570
+ * Extract expected data from expected_messages array.
7571
+ * Looks for the last assistant message with content.
6985
7572
  */
6986
- extractToolCallsFromMessages(messages) {
6987
- if (!messages) {
6988
- return [];
6989
- }
6990
- const toolCalls = [];
6991
- for (const message of messages) {
6992
- if (message.toolCalls) {
6993
- for (const call of message.toolCalls) {
6994
- toolCalls.push({
6995
- name: call.tool,
6996
- args: call.input
6997
- });
7573
+ extractExpectedData(expectedMessages) {
7574
+ for (let i = expectedMessages.length - 1; i >= 0; i--) {
7575
+ const message = expectedMessages[i];
7576
+ if (message.role === "assistant" && message.content) {
7577
+ if (typeof message.content === "object" && message.content !== null) {
7578
+ return message.content;
7579
+ }
7580
+ if (typeof message.content === "string") {
7581
+ try {
7582
+ return parseJsonFromTextSafe(message.content);
7583
+ } catch {
7584
+ }
6998
7585
  }
6999
7586
  }
7000
7587
  }
7001
- return toolCalls;
7588
+ return void 0;
7002
7589
  }
7003
7590
  /**
7004
- * Build a summary from extracted tool calls.
7591
+ * Evaluate a single field against the expected value.
7005
7592
  */
7006
- buildSummary(toolCalls) {
7007
- const toolCallsByName = {};
7008
- for (const call of toolCalls) {
7009
- toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
7010
- }
7011
- const toolNames = Object.keys(toolCallsByName).sort();
7012
- return {
7013
- eventCount: toolCalls.length,
7014
- toolNames,
7015
- toolCallsByName,
7016
- errorCount: 0
7017
- };
7018
- }
7019
- evaluateAnyOrder(summary) {
7020
- const minimums = this.config.minimums ?? {};
7021
- const toolNames = Object.keys(minimums);
7022
- if (toolNames.length === 0) {
7593
+ evaluateField(fieldConfig, candidateData, expectedData) {
7594
+ const { path: path17, match, required = true, weight = 1 } = fieldConfig;
7595
+ const candidateValue = resolvePath(candidateData, path17);
7596
+ const expectedValue = resolvePath(expectedData, path17);
7597
+ if (expectedValue === void 0) {
7023
7598
  return {
7599
+ path: path17,
7024
7600
  score: 1,
7025
- verdict: "pass",
7026
- hits: ["No tool requirements specified"],
7027
- misses: [],
7028
- expectedAspectCount: 0
7601
+ // No expected value means no comparison needed
7602
+ weight,
7603
+ hit: true,
7604
+ message: `${path17}: no expected value`
7029
7605
  };
7030
7606
  }
7031
- const hits = [];
7032
- const misses = [];
7033
- for (const toolName of toolNames) {
7034
- const required = minimums[toolName];
7035
- const actual = summary.toolCallsByName[toolName] ?? 0;
7036
- if (actual >= required) {
7037
- hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7038
- } else {
7039
- misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7607
+ if (candidateValue === void 0) {
7608
+ if (required) {
7609
+ return {
7610
+ path: path17,
7611
+ score: 0,
7612
+ weight,
7613
+ hit: false,
7614
+ message: `${path17} (required, missing)`
7615
+ };
7040
7616
  }
7041
- }
7042
- const score = hits.length / toolNames.length;
7043
- return {
7044
- score,
7045
- verdict: scoreToVerdict(score),
7046
- hits,
7047
- misses,
7048
- expectedAspectCount: toolNames.length
7049
- };
7050
- }
7051
- evaluateInOrder(toolCalls) {
7052
- const expected = this.config.expected ?? [];
7053
- if (expected.length === 0) {
7054
- return {
7055
- score: 1,
7056
- verdict: "pass",
7057
- hits: ["No tool sequence specified"],
7058
- misses: [],
7059
- expectedAspectCount: 0
7060
- };
7061
- }
7062
- const hits = [];
7063
- const misses = [];
7064
- let actualIndex = 0;
7065
- for (let i = 0; i < expected.length; i++) {
7066
- const expectedItem = expected[i];
7067
- const expectedTool = expectedItem.tool;
7068
- let found = false;
7069
- let argsMismatch = false;
7070
- while (actualIndex < toolCalls.length) {
7071
- const actualCall = toolCalls[actualIndex];
7072
- if (actualCall.name === expectedTool) {
7073
- if (argsMatch(expectedItem.args, actualCall.args)) {
7074
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
7075
- actualIndex++;
7076
- found = true;
7077
- break;
7078
- }
7079
- misses.push(
7080
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
7081
- );
7082
- actualIndex++;
7083
- argsMismatch = true;
7084
- break;
7085
- }
7086
- actualIndex++;
7087
- }
7088
- if (!found && !argsMismatch) {
7089
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
7090
- }
7091
- }
7092
- const score = hits.length / expected.length;
7093
- return {
7094
- score,
7095
- verdict: scoreToVerdict(score),
7096
- hits,
7097
- misses,
7098
- expectedAspectCount: expected.length
7099
- };
7100
- }
7101
- evaluateExact(toolCalls) {
7102
- const expected = this.config.expected ?? [];
7103
- if (expected.length === 0) {
7104
- return {
7105
- score: 1,
7106
- verdict: "pass",
7107
- hits: ["No tool sequence specified"],
7108
- misses: [],
7109
- expectedAspectCount: 0
7110
- };
7111
- }
7112
- const hits = [];
7113
- const misses = [];
7114
- if (toolCalls.length !== expected.length) {
7115
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
7116
- }
7117
- const checkLength = Math.min(expected.length, toolCalls.length);
7118
- for (let i = 0; i < checkLength; i++) {
7119
- const expectedItem = expected[i];
7120
- const expectedTool = expectedItem.tool;
7121
- const actualCall = toolCalls[i];
7122
- const actualTool = actualCall.name;
7123
- if (actualTool === expectedTool) {
7124
- if (argsMatch(expectedItem.args, actualCall.args)) {
7125
- hits.push(`Position ${i}: ${expectedTool}`);
7126
- } else {
7127
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
7128
- }
7129
- } else {
7130
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7131
- }
7132
- }
7133
- for (let i = checkLength; i < expected.length; i++) {
7134
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
7135
- }
7136
- const score = hits.length / expected.length;
7137
- return {
7138
- score,
7139
- verdict: scoreToVerdict(score),
7140
- hits,
7141
- misses,
7142
- expectedAspectCount: expected.length
7143
- };
7144
- }
7145
- };
7146
- var DEFAULT_DATE_FORMATS = [
7147
- "YYYY-MM-DDTHH:mm:ssZ",
7148
- // ISO with timezone
7149
- "YYYY-MM-DDTHH:mm:ss",
7150
- // ISO with time
7151
- "YYYY-MM-DD",
7152
- // ISO date
7153
- "DD-MMM-YYYY",
7154
- // Localized (e.g., "15-JAN-2025")
7155
- "MM/DD/YYYY",
7156
- // US format
7157
- "DD/MM/YYYY",
7158
- // EU format
7159
- "MM-DD-YYYY",
7160
- // US with dashes
7161
- "DD-MM-YYYY"
7162
- // EU with dashes
7163
- ];
7164
- var MONTH_NAMES = {
7165
- jan: 0,
7166
- january: 0,
7167
- feb: 1,
7168
- february: 1,
7169
- mar: 2,
7170
- march: 2,
7171
- apr: 3,
7172
- april: 3,
7173
- may: 4,
7174
- jun: 5,
7175
- june: 5,
7176
- jul: 6,
7177
- july: 6,
7178
- aug: 7,
7179
- august: 7,
7180
- sep: 8,
7181
- sept: 8,
7182
- september: 8,
7183
- oct: 9,
7184
- october: 9,
7185
- nov: 10,
7186
- november: 10,
7187
- dec: 11,
7188
- december: 11
7189
- };
7190
- var FieldAccuracyEvaluator = class {
7191
- kind = "field_accuracy";
7192
- config;
7193
- constructor(options) {
7194
- this.config = options.config;
7195
- }
7196
- evaluate(context) {
7197
- const { evalCase, candidate } = context;
7198
- let candidateData;
7199
- try {
7200
- candidateData = parseJsonFromTextSafe(candidate);
7201
- } catch {
7202
- return {
7203
- score: 0,
7204
- verdict: "fail",
7205
- hits: [],
7206
- misses: ["Failed to parse candidate answer as JSON"],
7207
- expectedAspectCount: this.config.fields.length,
7208
- reasoning: "Candidate answer is not valid JSON"
7209
- };
7210
- }
7211
- const expectedData = this.extractExpectedData(evalCase.expected_messages);
7212
- if (!expectedData) {
7213
- return {
7214
- score: 0,
7215
- verdict: "fail",
7216
- hits: [],
7217
- misses: ["No expected data found in expected_messages"],
7218
- expectedAspectCount: this.config.fields.length,
7219
- reasoning: "Could not extract expected data from expected_messages"
7220
- };
7221
- }
7222
- const fieldResults = [];
7223
- for (const fieldConfig of this.config.fields) {
7224
- const result = this.evaluateField(fieldConfig, candidateData, expectedData);
7225
- fieldResults.push(result);
7226
- }
7227
- return this.aggregateResults(fieldResults);
7228
- }
7229
- /**
7230
- * Extract expected data from expected_messages array.
7231
- * Looks for the last assistant message with content.
7232
- */
7233
- extractExpectedData(expectedMessages) {
7234
- for (let i = expectedMessages.length - 1; i >= 0; i--) {
7235
- const message = expectedMessages[i];
7236
- if (message.role === "assistant" && message.content) {
7237
- if (typeof message.content === "object" && message.content !== null) {
7238
- return message.content;
7239
- }
7240
- if (typeof message.content === "string") {
7241
- try {
7242
- return parseJsonFromTextSafe(message.content);
7243
- } catch {
7244
- }
7245
- }
7246
- }
7247
- }
7248
- return void 0;
7249
- }
7250
- /**
7251
- * Evaluate a single field against the expected value.
7252
- */
7253
- evaluateField(fieldConfig, candidateData, expectedData) {
7254
- const { path: path17, match, required = true, weight = 1 } = fieldConfig;
7255
- const candidateValue = resolvePath(candidateData, path17);
7256
- const expectedValue = resolvePath(expectedData, path17);
7257
- if (expectedValue === void 0) {
7258
- return {
7259
- path: path17,
7260
- score: 1,
7261
- // No expected value means no comparison needed
7262
- weight,
7263
- hit: true,
7264
- message: `${path17}: no expected value`
7265
- };
7266
- }
7267
- if (candidateValue === void 0) {
7268
- if (required) {
7269
- return {
7270
- path: path17,
7271
- score: 0,
7272
- weight,
7273
- hit: false,
7274
- message: `${path17} (required, missing)`
7275
- };
7276
- }
7277
- return {
7278
- path: path17,
7279
- score: 1,
7280
- // Don't penalize missing optional fields
7281
- weight: 0,
7282
- // Zero weight means it won't affect the score
7283
- hit: true,
7284
- message: `${path17}: optional field missing`
7285
- };
7617
+ return {
7618
+ path: path17,
7619
+ score: 1,
7620
+ // Don't penalize missing optional fields
7621
+ weight: 0,
7622
+ // Zero weight means it won't affect the score
7623
+ hit: true,
7624
+ message: `${path17}: optional field missing`
7625
+ };
7286
7626
  }
7287
7627
  switch (match) {
7288
7628
  case "exact":
@@ -7353,436 +7693,211 @@ var FieldAccuracyEvaluator = class {
7353
7693
  message: `${path17} (non-numeric value)`
7354
7694
  };
7355
7695
  }
7356
- if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
7357
- return {
7358
- path: path17,
7359
- score: 0,
7360
- weight,
7361
- hit: false,
7362
- message: `${path17} (invalid numeric value)`
7363
- };
7364
- }
7365
- const diff = Math.abs(candidateNum - expectedNum);
7366
- let withinTolerance;
7367
- if (relative) {
7368
- const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
7369
- withinTolerance = relativeDiff <= tolerance;
7370
- } else {
7371
- withinTolerance = diff <= tolerance;
7372
- }
7373
- if (withinTolerance) {
7374
- return {
7375
- path: path17,
7376
- score: 1,
7377
- weight,
7378
- hit: true,
7379
- message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
7380
- };
7381
- }
7382
- return {
7383
- path: path17,
7384
- score: 0,
7385
- weight,
7386
- hit: false,
7387
- message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7388
- };
7389
- }
7390
- /**
7391
- * Date comparison with format normalization.
7392
- */
7393
- compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
7394
- const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
7395
- const candidateDate = parseDate(String(candidateValue), formats);
7396
- const expectedDate = parseDate(String(expectedValue), formats);
7397
- if (candidateDate === null) {
7398
- return {
7399
- path: path17,
7400
- score: 0,
7401
- weight,
7402
- hit: false,
7403
- message: `${path17} (unparseable candidate date)`
7404
- };
7405
- }
7406
- if (expectedDate === null) {
7407
- return {
7408
- path: path17,
7409
- score: 0,
7410
- weight,
7411
- hit: false,
7412
- message: `${path17} (unparseable expected date)`
7413
- };
7414
- }
7415
- if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
7416
- return {
7417
- path: path17,
7418
- score: 1,
7419
- weight,
7420
- hit: true,
7421
- message: path17
7422
- };
7423
- }
7424
- return {
7425
- path: path17,
7426
- score: 0,
7427
- weight,
7428
- hit: false,
7429
- message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7430
- };
7431
- }
7432
- /**
7433
- * Aggregate field results using configured strategy.
7434
- */
7435
- aggregateResults(results) {
7436
- const aggregation = this.config.aggregation ?? "weighted_average";
7437
- const hits = [];
7438
- const misses = [];
7439
- for (const result of results) {
7440
- if (result.hit) {
7441
- hits.push(result.message);
7442
- } else {
7443
- misses.push(result.message);
7444
- }
7445
- }
7446
- let score;
7447
- if (aggregation === "all_or_nothing") {
7448
- score = misses.length === 0 ? 1 : 0;
7449
- } else {
7450
- const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
7451
- if (totalWeight === 0) {
7452
- score = results.length === 0 ? 1 : 0;
7453
- } else {
7454
- const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
7455
- score = weightedSum / totalWeight;
7456
- }
7457
- }
7458
- const reasoning = `${hits.length}/${results.length} fields matched`;
7459
- return {
7460
- score: clampScore(score),
7461
- verdict: scoreToVerdict(score),
7462
- hits: hits.slice(0, 4),
7463
- misses: misses.slice(0, 4),
7464
- expectedAspectCount: results.length,
7465
- reasoning
7466
- };
7467
- }
7468
- };
7469
- function resolvePath(obj, path17) {
7470
- if (!path17 || !obj) {
7471
- return void 0;
7472
- }
7473
- const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
7474
- let current = obj;
7475
- for (const part of parts) {
7476
- if (current === null || current === void 0) {
7477
- return void 0;
7478
- }
7479
- if (typeof current !== "object") {
7480
- return void 0;
7481
- }
7482
- const isIndex = /^\d+$/.test(part);
7483
- if (isIndex && Array.isArray(current)) {
7484
- current = current[Number.parseInt(part, 10)];
7485
- } else {
7486
- current = current[part];
7487
- }
7488
- }
7489
- return current;
7490
- }
7491
- function toNumber(value) {
7492
- if (typeof value === "number") {
7493
- return value;
7494
- }
7495
- if (typeof value === "string") {
7496
- const num = Number.parseFloat(value);
7497
- return Number.isNaN(num) ? null : num;
7498
- }
7499
- return null;
7500
- }
7501
- function parseDate(dateStr, formats) {
7502
- if (!dateStr) return null;
7503
- const trimmed = dateStr.trim();
7504
- const isoDate = new Date(trimmed);
7505
- if (!Number.isNaN(isoDate.getTime())) {
7506
- return isoDate;
7507
- }
7508
- const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
7509
- if (localizedMatch) {
7510
- const day = Number.parseInt(localizedMatch[1], 10);
7511
- const monthName = localizedMatch[2].toLowerCase();
7512
- const year = Number.parseInt(localizedMatch[3], 10);
7513
- const month = MONTH_NAMES[monthName];
7514
- if (month !== void 0) {
7515
- return new Date(year, month, day);
7516
- }
7517
- }
7518
- const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
7519
- if (usMatch) {
7520
- const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
7521
- const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
7522
- if (hasUSFormat && !hasEUFormat) {
7523
- const month = Number.parseInt(usMatch[1], 10) - 1;
7524
- const day = Number.parseInt(usMatch[2], 10);
7525
- const year = Number.parseInt(usMatch[3], 10);
7526
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7527
- return new Date(year, month, day);
7528
- }
7529
- } else if (hasEUFormat && !hasUSFormat) {
7530
- const day = Number.parseInt(usMatch[1], 10);
7531
- const month = Number.parseInt(usMatch[2], 10) - 1;
7532
- const year = Number.parseInt(usMatch[3], 10);
7533
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7534
- return new Date(year, month, day);
7535
- }
7536
- } else {
7537
- const num1 = Number.parseInt(usMatch[1], 10);
7538
- const num2 = Number.parseInt(usMatch[2], 10);
7539
- const year = Number.parseInt(usMatch[3], 10);
7540
- if (num1 > 12 && num2 <= 12) {
7541
- return new Date(year, num2 - 1, num1);
7542
- }
7543
- if (num2 > 12 && num1 <= 12) {
7544
- return new Date(year, num1 - 1, num2);
7545
- }
7546
- if (num1 <= 12 && num2 <= 31) {
7547
- return new Date(year, num1 - 1, num2);
7548
- }
7549
- }
7550
- }
7551
- return null;
7552
- }
7553
- function formatDateISO(date) {
7554
- return date.toISOString().split("T")[0];
7555
- }
7556
- function parseJsonFromTextSafe(text) {
7557
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
7558
- const match = cleaned.match(/\{[\s\S]*\}/);
7559
- const blob = match?.[0] ?? cleaned;
7560
- return JSON.parse(blob);
7561
- }
7562
- var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
7563
- {{EVALUATOR_RESULTS_JSON}}
7564
-
7565
- Decide the final score and verdict based on all evaluator results.
7566
- Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
7567
- var CompositeEvaluator = class {
7568
- kind = "composite";
7569
- config;
7570
- evaluatorFactory;
7571
- cwd;
7572
- constructor(options) {
7573
- this.config = options.config;
7574
- this.evaluatorFactory = options.evaluatorFactory;
7575
- this.cwd = options.cwd;
7576
- }
7577
- async evaluate(context) {
7578
- const memberResults = await Promise.all(
7579
- this.config.evaluators.map(async (memberConfig) => {
7580
- const evaluator = this.evaluatorFactory.create(memberConfig, context);
7581
- return {
7582
- id: memberConfig.name,
7583
- type: memberConfig.type,
7584
- result: await evaluator.evaluate(context)
7585
- };
7586
- })
7587
- );
7588
- return this.aggregate(memberResults, context);
7589
- }
7590
- async aggregate(results, context) {
7591
- const aggregator = this.config.aggregator;
7592
- switch (aggregator.type) {
7593
- case "code_judge":
7594
- return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
7595
- case "llm_judge":
7596
- return this.runLlmAggregator(results, context, aggregator);
7597
- default:
7598
- return this.runWeightedAverage(results, aggregator.weights);
7599
- }
7600
- }
7601
- runWeightedAverage(results, weights) {
7602
- let totalWeight = 0;
7603
- let weightedSum = 0;
7604
- const allHits = [];
7605
- const allMisses = [];
7606
- const reasoningParts = [];
7607
- const evaluatorResults = [];
7608
- for (const member of results) {
7609
- const weight = weights?.[member.id] ?? 1;
7610
- totalWeight += weight;
7611
- weightedSum += member.result.score * weight;
7612
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
7613
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
7614
- if (member.result.reasoning) {
7615
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
7616
- }
7617
- evaluatorResults.push({
7618
- name: member.id,
7619
- type: member.type,
7620
- score: member.result.score,
7621
- weight,
7622
- verdict: member.result.verdict,
7623
- hits: [...member.result.hits],
7624
- misses: [...member.result.misses],
7625
- reasoning: member.result.reasoning,
7626
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7627
- evaluatorResults: member.result.evaluatorResults
7628
- });
7629
- }
7630
- const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
7631
- return {
7632
- score: clampScore(finalScore),
7633
- verdict: scoreToVerdict(finalScore),
7634
- hits: allHits,
7635
- misses: allMisses,
7636
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
7637
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
7638
- evaluatorRawRequest: {
7639
- aggregator: "weighted_average",
7640
- ...weights ? { weights } : {}
7641
- },
7642
- evaluatorResults
7643
- };
7644
- }
7645
- async runCodeAggregator(results, scriptPath, cwd, weights) {
7646
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7647
- const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
7648
- const evaluatorResults = results.map((member) => ({
7649
- name: member.id,
7650
- type: member.type,
7651
- score: member.result.score,
7652
- weight: weights?.[member.id] ?? 1,
7653
- verdict: member.result.verdict,
7654
- hits: [...member.result.hits],
7655
- misses: [...member.result.misses],
7656
- reasoning: member.result.reasoning,
7657
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7658
- evaluatorResults: member.result.evaluatorResults
7659
- }));
7660
- try {
7661
- const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
7662
- const parsed = parseJsonSafe(stdout);
7663
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
7664
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
7665
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
7666
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
7667
- const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
7668
- return {
7669
- score,
7670
- verdict,
7671
- hits,
7672
- misses,
7673
- expectedAspectCount: hits.length + misses.length || 1,
7674
- reasoning,
7675
- evaluatorRawRequest: {
7676
- aggregator: "code_judge",
7677
- script: scriptPath
7678
- },
7679
- evaluatorResults
7680
- };
7681
- } catch (error) {
7682
- const message = error instanceof Error ? error.message : String(error);
7683
- return {
7684
- score: 0,
7685
- verdict: "fail",
7686
- hits: [],
7687
- misses: [`Code aggregator failed: ${message}`],
7688
- expectedAspectCount: 1,
7689
- reasoning: message,
7690
- evaluatorRawRequest: {
7691
- aggregator: "code_judge",
7692
- script: scriptPath,
7693
- error: message
7694
- },
7695
- evaluatorResults
7696
- };
7697
- }
7698
- }
7699
- async runLlmAggregator(results, context, config) {
7700
- const judgeProvider = context.judgeProvider;
7701
- if (!judgeProvider) {
7702
- throw new Error("No judge provider available for LLM aggregation");
7703
- }
7704
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7705
- const resultsJson = JSON.stringify(resultsObject, null, 2);
7706
- const evaluatorResults = results.map((member) => ({
7707
- name: member.id,
7708
- type: member.type,
7709
- score: member.result.score,
7710
- verdict: member.result.verdict,
7711
- hits: [...member.result.hits],
7712
- misses: [...member.result.misses],
7713
- reasoning: member.result.reasoning,
7714
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7715
- evaluatorResults: member.result.evaluatorResults
7716
- }));
7717
- const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
7718
- const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
7719
- const systemPrompt = buildOutputSchema();
7720
- const evaluatorRawRequest = {
7721
- aggregator: "llm_judge",
7722
- userPrompt,
7723
- systemPrompt,
7724
- target: judgeProvider.targetName
7725
- };
7726
- try {
7727
- const model = judgeProvider.asLanguageModel?.();
7728
- if (model) {
7729
- const { text } = await (0, import_ai2.generateText)({
7730
- model,
7731
- system: systemPrompt,
7732
- prompt: userPrompt
7733
- });
7734
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
7735
- const score2 = clampScore(data2.score);
7736
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
7737
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
7738
- const reasoning2 = data2.reasoning;
7739
- return {
7740
- score: score2,
7741
- verdict: scoreToVerdict(score2),
7742
- hits: hits2,
7743
- misses: misses2,
7744
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
7745
- reasoning: reasoning2,
7746
- evaluatorRawRequest,
7747
- evaluatorResults
7748
- };
7749
- }
7750
- const response = await judgeProvider.invoke({
7751
- question: userPrompt,
7752
- systemPrompt,
7753
- evalCaseId: context.evalCase.id,
7754
- attempt: context.attempt
7755
- });
7756
- const data = freeformEvaluationSchema.parse(
7757
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
7758
- );
7759
- const score = clampScore(data.score);
7760
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
7761
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
7762
- const reasoning = data.reasoning;
7696
+ if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
7763
7697
  return {
7764
- score,
7765
- verdict: scoreToVerdict(score),
7766
- hits,
7767
- misses,
7768
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
7769
- reasoning,
7770
- evaluatorRawRequest,
7771
- evaluatorResults
7698
+ path: path17,
7699
+ score: 0,
7700
+ weight,
7701
+ hit: false,
7702
+ message: `${path17} (invalid numeric value)`
7772
7703
  };
7773
- } catch {
7704
+ }
7705
+ const diff = Math.abs(candidateNum - expectedNum);
7706
+ let withinTolerance;
7707
+ if (relative) {
7708
+ const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
7709
+ withinTolerance = relativeDiff <= tolerance;
7710
+ } else {
7711
+ withinTolerance = diff <= tolerance;
7712
+ }
7713
+ if (withinTolerance) {
7714
+ return {
7715
+ path: path17,
7716
+ score: 1,
7717
+ weight,
7718
+ hit: true,
7719
+ message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
7720
+ };
7721
+ }
7722
+ return {
7723
+ path: path17,
7724
+ score: 0,
7725
+ weight,
7726
+ hit: false,
7727
+ message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7728
+ };
7729
+ }
7730
+ /**
7731
+ * Date comparison with format normalization.
7732
+ */
7733
+ compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
7734
+ const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
7735
+ const candidateDate = parseDate(String(candidateValue), formats);
7736
+ const expectedDate = parseDate(String(expectedValue), formats);
7737
+ if (candidateDate === null) {
7774
7738
  return {
7739
+ path: path17,
7775
7740
  score: 0,
7776
- verdict: "fail",
7777
- hits: [],
7778
- misses: [],
7779
- expectedAspectCount: 1,
7780
- evaluatorRawRequest,
7781
- evaluatorResults
7741
+ weight,
7742
+ hit: false,
7743
+ message: `${path17} (unparseable candidate date)`
7744
+ };
7745
+ }
7746
+ if (expectedDate === null) {
7747
+ return {
7748
+ path: path17,
7749
+ score: 0,
7750
+ weight,
7751
+ hit: false,
7752
+ message: `${path17} (unparseable expected date)`
7753
+ };
7754
+ }
7755
+ if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
7756
+ return {
7757
+ path: path17,
7758
+ score: 1,
7759
+ weight,
7760
+ hit: true,
7761
+ message: path17
7782
7762
  };
7783
7763
  }
7764
+ return {
7765
+ path: path17,
7766
+ score: 0,
7767
+ weight,
7768
+ hit: false,
7769
+ message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7770
+ };
7771
+ }
7772
+ /**
7773
+ * Aggregate field results using configured strategy.
7774
+ */
7775
+ aggregateResults(results) {
7776
+ const aggregation = this.config.aggregation ?? "weighted_average";
7777
+ const hits = [];
7778
+ const misses = [];
7779
+ for (const result of results) {
7780
+ if (result.hit) {
7781
+ hits.push(result.message);
7782
+ } else {
7783
+ misses.push(result.message);
7784
+ }
7785
+ }
7786
+ let score;
7787
+ if (aggregation === "all_or_nothing") {
7788
+ score = misses.length === 0 ? 1 : 0;
7789
+ } else {
7790
+ const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
7791
+ if (totalWeight === 0) {
7792
+ score = results.length === 0 ? 1 : 0;
7793
+ } else {
7794
+ const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
7795
+ score = weightedSum / totalWeight;
7796
+ }
7797
+ }
7798
+ const reasoning = `${hits.length}/${results.length} fields matched`;
7799
+ return {
7800
+ score: clampScore(score),
7801
+ verdict: scoreToVerdict(score),
7802
+ hits: hits.slice(0, 4),
7803
+ misses: misses.slice(0, 4),
7804
+ expectedAspectCount: results.length,
7805
+ reasoning
7806
+ };
7784
7807
  }
7785
7808
  };
7809
+ function resolvePath(obj, path17) {
7810
+ if (!path17 || !obj) {
7811
+ return void 0;
7812
+ }
7813
+ const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
7814
+ let current = obj;
7815
+ for (const part of parts) {
7816
+ if (current === null || current === void 0) {
7817
+ return void 0;
7818
+ }
7819
+ if (typeof current !== "object") {
7820
+ return void 0;
7821
+ }
7822
+ const isIndex = /^\d+$/.test(part);
7823
+ if (isIndex && Array.isArray(current)) {
7824
+ current = current[Number.parseInt(part, 10)];
7825
+ } else {
7826
+ current = current[part];
7827
+ }
7828
+ }
7829
+ return current;
7830
+ }
7831
+ function toNumber(value) {
7832
+ if (typeof value === "number") {
7833
+ return value;
7834
+ }
7835
+ if (typeof value === "string") {
7836
+ const num = Number.parseFloat(value);
7837
+ return Number.isNaN(num) ? null : num;
7838
+ }
7839
+ return null;
7840
+ }
7841
+ function parseDate(dateStr, formats) {
7842
+ if (!dateStr) return null;
7843
+ const trimmed = dateStr.trim();
7844
+ const isoDate = new Date(trimmed);
7845
+ if (!Number.isNaN(isoDate.getTime())) {
7846
+ return isoDate;
7847
+ }
7848
+ const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
7849
+ if (localizedMatch) {
7850
+ const day = Number.parseInt(localizedMatch[1], 10);
7851
+ const monthName = localizedMatch[2].toLowerCase();
7852
+ const year = Number.parseInt(localizedMatch[3], 10);
7853
+ const month = MONTH_NAMES[monthName];
7854
+ if (month !== void 0) {
7855
+ return new Date(year, month, day);
7856
+ }
7857
+ }
7858
+ const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
7859
+ if (usMatch) {
7860
+ const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
7861
+ const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
7862
+ if (hasUSFormat && !hasEUFormat) {
7863
+ const month = Number.parseInt(usMatch[1], 10) - 1;
7864
+ const day = Number.parseInt(usMatch[2], 10);
7865
+ const year = Number.parseInt(usMatch[3], 10);
7866
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7867
+ return new Date(year, month, day);
7868
+ }
7869
+ } else if (hasEUFormat && !hasUSFormat) {
7870
+ const day = Number.parseInt(usMatch[1], 10);
7871
+ const month = Number.parseInt(usMatch[2], 10) - 1;
7872
+ const year = Number.parseInt(usMatch[3], 10);
7873
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7874
+ return new Date(year, month, day);
7875
+ }
7876
+ } else {
7877
+ const num1 = Number.parseInt(usMatch[1], 10);
7878
+ const num2 = Number.parseInt(usMatch[2], 10);
7879
+ const year = Number.parseInt(usMatch[3], 10);
7880
+ if (num1 > 12 && num2 <= 12) {
7881
+ return new Date(year, num2 - 1, num1);
7882
+ }
7883
+ if (num2 > 12 && num1 <= 12) {
7884
+ return new Date(year, num1 - 1, num2);
7885
+ }
7886
+ if (num1 <= 12 && num2 <= 31) {
7887
+ return new Date(year, num1 - 1, num2);
7888
+ }
7889
+ }
7890
+ }
7891
+ return null;
7892
+ }
7893
+ function formatDateISO(date) {
7894
+ return date.toISOString().split("T")[0];
7895
+ }
7896
+ function parseJsonFromTextSafe(text) {
7897
+ return parseJsonFromText(text);
7898
+ }
7899
+
7900
+ // src/evaluation/evaluators/latency.ts
7786
7901
  var LatencyEvaluator = class {
7787
7902
  kind = "latency";
7788
7903
  config;
@@ -7816,56 +7931,16 @@ var LatencyEvaluator = class {
7816
7931
  misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
7817
7932
  expectedAspectCount: 1,
7818
7933
  reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
7819
- evaluatorRawRequest: {
7820
- type: "latency",
7821
- threshold,
7822
- durationMs
7823
- }
7824
- };
7825
- }
7826
- };
7827
- var CostEvaluator = class {
7828
- kind = "cost";
7829
- config;
7830
- constructor(options) {
7831
- this.config = options.config;
7832
- }
7833
- evaluate(context) {
7834
- const { budget } = this.config;
7835
- const costUsd = context.traceSummary?.costUsd;
7836
- if (costUsd === void 0) {
7837
- return {
7838
- score: 0,
7839
- verdict: "fail",
7840
- hits: [],
7841
- misses: ["No cost data available in trace"],
7842
- expectedAspectCount: 1,
7843
- reasoning: "Execution cost not reported by provider",
7844
- evaluatorRawRequest: {
7845
- type: "cost",
7846
- budget,
7847
- costUsd: null
7848
- }
7849
- };
7850
- }
7851
- const passed = costUsd <= budget;
7852
- const score = passed ? 1 : 0;
7853
- const formatCost = (n) => `$${n.toFixed(4)}`;
7854
- return {
7855
- score,
7856
- verdict: passed ? "pass" : "fail",
7857
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
7858
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
7859
- expectedAspectCount: 1,
7860
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
7861
- evaluatorRawRequest: {
7862
- type: "cost",
7863
- budget,
7864
- costUsd
7934
+ evaluatorRawRequest: {
7935
+ type: "latency",
7936
+ threshold,
7937
+ durationMs
7865
7938
  }
7866
7939
  };
7867
7940
  }
7868
7941
  };
7942
+
7943
+ // src/evaluation/evaluators/token-usage.ts
7869
7944
  var TokenUsageEvaluator = class {
7870
7945
  kind = "token_usage";
7871
7946
  config;
@@ -7949,8 +8024,228 @@ var TokenUsageEvaluator = class {
7949
8024
  }
7950
8025
  };
7951
8026
 
8027
+ // src/evaluation/evaluators/tool-trajectory.ts
8028
/**
 * Decide whether the arguments of a recorded tool call satisfy an expectation.
 *
 * Matching is partial: only the keys present on `expected` are compared (using
 * the file-level `deepEqual`); additional keys on `actual` are ignored.
 *
 * @param {object|"any"|null|undefined} expected - Expected arguments. `undefined`,
 *   `null` and the literal string "any" all mean "no constraint".
 * @param {*} actual - Arguments recorded for the tool call.
 * @returns {boolean} `true` when every expected key is an own property of
 *   `actual` with a deeply equal value, or when there is no constraint.
 */
function argsMatch(expected, actual) {
  // `null` is treated like `undefined`; Object.keys(null) would otherwise throw.
  if (expected === void 0 || expected === null || expected === "any") return true;
  // Object.hasOwn throws on null/undefined, so a call without arguments can
  // never satisfy a concrete expectation.
  if (actual === void 0 || actual === null) return false;
  return Object.keys(expected).every(
    (key) => Object.hasOwn(actual, key) && deepEqual(expected[key], actual[key])
  );
}
8038
/**
 * Evaluator that scores a run by the tools it called.
 *
 * Reads three fields from `config`:
 *   - `mode`: "any_order", "in_order" or "exact"
 *   - `minimums`: map of tool name -> minimum call count (any_order)
 *   - `expected`: ordered list of `{ tool, args? }` items (in_order / exact)
 *
 * Every evaluation returns `{ score, verdict, hits, misses, expectedAspectCount }`.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{ config: object }} options - Only `options.config` is stored.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Run the configured trajectory check.
   *
   * @param {{ outputMessages?: Array, traceSummary?: object }} context
   * @returns {object} Evaluation result; a failing result when neither tool
   *   calls nor a trace summary are available, or when `mode` is unknown.
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    // Prefer a summary built from the actual calls; fall back to the supplied
    // trace summary only when the messages carry no tool calls.
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    if (!summary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(summary);
      // The order-sensitive modes work on the individual calls extracted from
      // the messages, not on the aggregated summary.
      case "in_order":
        return this.evaluateInOrder(toolCalls);
      case "exact":
        return this.evaluateExact(toolCalls);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Extract tool calls from output messages.
   *
   * @param {Array|undefined} messages - Messages whose optional `toolCalls`
   *   entries expose `tool` and `input`.
   * @returns {Array<{ name: *, args: * }>} Calls in message order; empty when
   *   `messages` is falsy.
   */
  extractToolCallsFromMessages(messages) {
    if (!messages) {
      return [];
    }
    const toolCalls = [];
    for (const message of messages) {
      if (message.toolCalls) {
        for (const call of message.toolCalls) {
          toolCalls.push({
            name: call.tool,
            args: call.input
          });
        }
      }
    }
    return toolCalls;
  }
  /**
   * Build a summary from extracted tool calls.
   *
   * @param {Array<{ name: * }>} toolCalls
   * @returns {{ eventCount: number, toolNames: string[], toolCallsByName: object, errorCount: number }}
   */
  buildSummary(toolCalls) {
    // Count in a Map so tool names that shadow Object.prototype members
    // ("constructor", "toString", ...) start from 0 instead of the inherited value.
    const counts = new Map();
    for (const call of toolCalls) {
      const name = String(call.name);
      counts.set(name, (counts.get(name) ?? 0) + 1);
    }
    // Object.fromEntries defines own data properties, so even "__proto__" is
    // stored as a regular key.
    const toolCallsByName = Object.fromEntries(counts);
    const toolNames = Object.keys(toolCallsByName).sort();
    return {
      eventCount: toolCalls.length,
      toolNames,
      toolCallsByName,
      errorCount: 0
    };
  }
  /**
   * Check that each tool in `config.minimums` was called at least the required
   * number of times, regardless of order.
   *
   * @param {{ toolCallsByName: object }} summary
   * @returns {object} Score is the fraction of tools meeting their minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    const callCounts = summary.toolCallsByName;
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property lookup only: an uncalled tool named like an inherited
      // member must count as 0 calls, not as the inherited function.
      const actual = (Object.hasOwn(callCounts, toolName) ? callCounts[toolName] : void 0) ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the tools in `config.expected` appear in the given order, with
   * other calls allowed in between.
   *
   * @param {Array<{ name: *, args: * }>} toolCalls
   * @returns {object} Score is the fraction of expected items found.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    // Cursor into the actual calls; it only moves forward, which enforces order.
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      let found = false;
      let argsMismatch = false;
      while (actualIndex < toolCalls.length) {
        const actualCall = toolCalls[actualIndex];
        if (actualCall.name === expectedTool) {
          if (argsMatch(expectedItem.args, actualCall.args)) {
            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
            actualIndex++;
            found = true;
            break;
          }
          // NOTE(review): the first call with the right name but wrong args
          // consumes this expected item; a later call with matching args is
          // not searched for. Confirm this is the intended semantics.
          misses.push(
            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
          );
          actualIndex++;
          argsMismatch = true;
          break;
        }
        actualIndex++;
      }
      if (!found && !argsMismatch) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the actual calls match `config.expected` position by position.
   *
   * @param {Array<{ name: *, args: * }>} toolCalls
   * @returns {object} Score is matched positions divided by the longer of the
   *   expected and actual sequences.
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, toolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      const actualCall = toolCalls[i];
      const actualTool = actualCall.name;
      if (actualTool === expectedTool) {
        if (argsMatch(expectedItem.args, actualCall.args)) {
          hits.push(`Position ${i}: ${expectedTool}`);
        } else {
          misses.push(`Position ${i}: ${expectedTool} args mismatch`);
        }
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Surplus actual calls also break an exact match, so they enlarge the
    // denominator; dividing by expected.length alone would score a trajectory
    // with extra trailing calls as a perfect 1.0.
    const score = hits.length / Math.max(expected.length, toolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
8246
+
7952
8247
  // src/evaluation/orchestrator.ts
7953
- var import_node_crypto4 = require("crypto");
8248
+ var import_node_crypto5 = require("crypto");
7954
8249
  var import_node_path16 = __toESM(require("path"), 1);
7955
8250
 
7956
8251
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -8162,6 +8457,17 @@ async function runEvaluation(options) {
8162
8457
  }
8163
8458
  return getOrCreateProvider(resolvedJudge);
8164
8459
  };
8460
+ const targetResolver = (name) => {
8461
+ const resolved = resolveTargetByName(name);
8462
+ if (!resolved) {
8463
+ return void 0;
8464
+ }
8465
+ return getOrCreateProvider(resolved);
8466
+ };
8467
+ const availableTargets = [
8468
+ target.name,
8469
+ ...Array.from(targetDefinitions.keys())
8470
+ ];
8165
8471
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
8166
8472
  const primaryProvider = getOrCreateProvider(target);
8167
8473
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -8191,7 +8497,9 @@ async function runEvaluation(options) {
8191
8497
  onResult,
8192
8498
  verbose,
8193
8499
  resolveJudgeProvider,
8194
- agentTimeoutMs
8500
+ agentTimeoutMs,
8501
+ targetResolver,
8502
+ availableTargets
8195
8503
  });
8196
8504
  } catch (error) {
8197
8505
  if (verbose) {
@@ -8230,7 +8538,9 @@ async function runEvaluation(options) {
8230
8538
  cache,
8231
8539
  useCache,
8232
8540
  now,
8233
- judgeProvider
8541
+ judgeProvider,
8542
+ targetResolver,
8543
+ availableTargets
8234
8544
  });
8235
8545
  if (onProgress) {
8236
8546
  await onProgress({
@@ -8297,7 +8607,9 @@ async function runBatchEvaluation(options) {
8297
8607
  onProgress,
8298
8608
  onResult,
8299
8609
  resolveJudgeProvider,
8300
- agentTimeoutMs
8610
+ agentTimeoutMs,
8611
+ targetResolver,
8612
+ availableTargets
8301
8613
  } = options;
8302
8614
  const promptInputsList = [];
8303
8615
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -8356,7 +8668,7 @@ async function runBatchEvaluation(options) {
8356
8668
  costUsd: providerResponse.costUsd,
8357
8669
  durationMs: providerResponse.durationMs
8358
8670
  }) : void 0;
8359
- const candidate = extractLastAssistantContent(outputMessages);
8671
+ const candidate = extractLastAssistantContent2(outputMessages);
8360
8672
  const providerError = extractProviderError(providerResponse);
8361
8673
  let result;
8362
8674
  try {
@@ -8372,7 +8684,9 @@ async function runBatchEvaluation(options) {
8372
8684
  judgeProvider: await resolveJudgeProvider(target),
8373
8685
  agentTimeoutMs,
8374
8686
  outputMessages,
8375
- traceSummary
8687
+ traceSummary,
8688
+ targetResolver,
8689
+ availableTargets
8376
8690
  });
8377
8691
  if (providerError) {
8378
8692
  result = { ...result, error: providerError };
@@ -8430,7 +8744,9 @@ async function runEvalCase(options) {
8430
8744
  cache,
8431
8745
  useCache,
8432
8746
  signal,
8433
- judgeProvider
8747
+ judgeProvider,
8748
+ targetResolver,
8749
+ availableTargets
8434
8750
  } = options;
8435
8751
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
8436
8752
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -8489,7 +8805,7 @@ async function runEvalCase(options) {
8489
8805
  costUsd: providerResponse.costUsd,
8490
8806
  durationMs: providerResponse.durationMs
8491
8807
  }) : void 0;
8492
- const candidate = extractLastAssistantContent(outputMessages);
8808
+ const candidate = extractLastAssistantContent2(outputMessages);
8493
8809
  const providerError = extractProviderError(providerResponse);
8494
8810
  try {
8495
8811
  const result = await evaluateCandidate({
@@ -8504,7 +8820,9 @@ async function runEvalCase(options) {
8504
8820
  judgeProvider,
8505
8821
  agentTimeoutMs,
8506
8822
  outputMessages,
8507
- traceSummary
8823
+ traceSummary,
8824
+ targetResolver,
8825
+ availableTargets
8508
8826
  });
8509
8827
  return providerError ? { ...result, error: providerError } : result;
8510
8828
  } catch (error) {
@@ -8524,7 +8842,9 @@ async function evaluateCandidate(options) {
8524
8842
  judgeProvider,
8525
8843
  agentTimeoutMs,
8526
8844
  outputMessages,
8527
- traceSummary
8845
+ traceSummary,
8846
+ targetResolver,
8847
+ availableTargets
8528
8848
  } = options;
8529
8849
  const gradeTimestamp = nowFn();
8530
8850
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -8539,7 +8859,9 @@ async function evaluateCandidate(options) {
8539
8859
  judgeProvider,
8540
8860
  agentTimeoutMs,
8541
8861
  outputMessages,
8542
- traceSummary
8862
+ traceSummary,
8863
+ targetResolver,
8864
+ availableTargets
8543
8865
  });
8544
8866
  const completedAt = nowFn();
8545
8867
  let agentProviderRequest;
@@ -8592,7 +8914,9 @@ async function runEvaluatorsForCase(options) {
8592
8914
  judgeProvider,
8593
8915
  agentTimeoutMs,
8594
8916
  outputMessages,
8595
- traceSummary
8917
+ traceSummary,
8918
+ targetResolver,
8919
+ availableTargets
8596
8920
  } = options;
8597
8921
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
8598
8922
  return runEvaluatorList({
@@ -8608,7 +8932,9 @@ async function runEvaluatorsForCase(options) {
8608
8932
  judgeProvider,
8609
8933
  agentTimeoutMs,
8610
8934
  outputMessages,
8611
- traceSummary
8935
+ traceSummary,
8936
+ targetResolver,
8937
+ availableTargets
8612
8938
  });
8613
8939
  }
8614
8940
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -8626,7 +8952,9 @@ async function runEvaluatorsForCase(options) {
8626
8952
  now,
8627
8953
  judgeProvider,
8628
8954
  outputMessages,
8629
- traceSummary
8955
+ traceSummary,
8956
+ targetResolver,
8957
+ availableTargets
8630
8958
  });
8631
8959
  return { score };
8632
8960
  }
@@ -8644,7 +8972,9 @@ async function runEvaluatorList(options) {
8644
8972
  judgeProvider,
8645
8973
  agentTimeoutMs,
8646
8974
  outputMessages,
8647
- traceSummary
8975
+ traceSummary,
8976
+ targetResolver,
8977
+ availableTargets
8648
8978
  } = options;
8649
8979
  const scored = [];
8650
8980
  const evaluatorResults = [];
@@ -8682,7 +9012,8 @@ async function runEvaluatorList(options) {
8682
9012
  script: evaluator.script,
8683
9013
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
8684
9014
  agentTimeoutMs,
8685
- config: evaluator.config
9015
+ config: evaluator.config,
9016
+ target: evaluator.target
8686
9017
  });
8687
9018
  const score2 = await codeEvaluator.evaluate({
8688
9019
  evalCase,
@@ -8692,8 +9023,11 @@ async function runEvaluatorList(options) {
8692
9023
  attempt,
8693
9024
  promptInputs,
8694
9025
  now,
9026
+ judgeProvider,
8695
9027
  outputMessages,
8696
- traceSummary
9028
+ traceSummary,
9029
+ targetResolver,
9030
+ availableTargets
8697
9031
  });
8698
9032
  const weight = evaluator.weight ?? 1;
8699
9033
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -8706,7 +9040,8 @@ async function runEvaluatorList(options) {
8706
9040
  hits: score2.hits,
8707
9041
  misses: score2.misses,
8708
9042
  reasoning: score2.reasoning,
8709
- evaluatorProviderRequest: score2.evaluatorRawRequest
9043
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
9044
+ details: score2.details
8710
9045
  });
8711
9046
  }
8712
9047
  if (evaluator.type === "composite") {
@@ -8720,7 +9055,8 @@ async function runEvaluatorList(options) {
8720
9055
  script: memberConfig.script,
8721
9056
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
8722
9057
  agentTimeoutMs,
8723
- config: memberConfig.config
9058
+ config: memberConfig.config,
9059
+ target: memberConfig.target
8724
9060
  });
8725
9061
  case "composite":
8726
9062
  return new CompositeEvaluator({
@@ -8769,7 +9105,9 @@ async function runEvaluatorList(options) {
8769
9105
  now,
8770
9106
  judgeProvider,
8771
9107
  outputMessages,
8772
- traceSummary
9108
+ traceSummary,
9109
+ targetResolver,
9110
+ availableTargets
8773
9111
  });
8774
9112
  const weight = evaluator.weight ?? 1;
8775
9113
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -8965,11 +9303,11 @@ async function runEvaluatorList(options) {
8965
9303
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
8966
9304
  0
8967
9305
  );
8968
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
9306
+ const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
8969
9307
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
8970
9308
  const score = {
8971
9309
  score: aggregateScore,
8972
- verdict: scoreToVerdict2(aggregateScore),
9310
+ verdict: scoreToVerdict(aggregateScore),
8973
9311
  hits,
8974
9312
  misses,
8975
9313
  expectedAspectCount,
@@ -9016,18 +9354,6 @@ async function resolveCustomPrompt(config) {
9016
9354
  }
9017
9355
  return config.prompt;
9018
9356
  }
9019
- function isNonEmptyString2(value) {
9020
- return typeof value === "string" && value.trim().length > 0;
9021
- }
9022
- function scoreToVerdict2(score) {
9023
- if (score >= 0.8) {
9024
- return "pass";
9025
- }
9026
- if (score >= 0.6) {
9027
- return "borderline";
9028
- }
9029
- return "fail";
9030
- }
9031
9357
  function filterEvalCases(evalCases, evalId) {
9032
9358
  if (!evalId) {
9033
9359
  return evalCases;
@@ -9129,7 +9455,7 @@ function extractProviderError(response) {
9129
9455
  return trimmed.length > 0 ? trimmed : void 0;
9130
9456
  }
9131
9457
  function createCacheKey(provider, target, evalCase, promptInputs) {
9132
- const hash = (0, import_node_crypto4.createHash)("sha256");
9458
+ const hash = (0, import_node_crypto5.createHash)("sha256");
9133
9459
  hash.update(provider.id);
9134
9460
  hash.update(target.name);
9135
9461
  hash.update(evalCase.id);
@@ -9170,7 +9496,8 @@ function mapChildResults(children) {
9170
9496
  misses: child.misses,
9171
9497
  reasoning: child.reasoning,
9172
9498
  evaluatorProviderRequest: child.evaluatorRawRequest,
9173
- evaluatorResults: mapChildResults(child.evaluatorResults)
9499
+ evaluatorResults: mapChildResults(child.evaluatorResults),
9500
+ details: child.details
9174
9501
  }));
9175
9502
  }
9176
9503
  function computeWeightedMean(entries) {
@@ -9185,7 +9512,7 @@ function computeWeightedMean(entries) {
9185
9512
  }
9186
9513
 
9187
9514
  // src/evaluation/generators/rubric-generator.ts
9188
- var import_ai3 = require("ai");
9515
+ var import_ai4 = require("ai");
9189
9516
  var import_zod4 = require("zod");
9190
9517
  var rubricItemSchema = import_zod4.z.object({
9191
9518
  id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -9219,7 +9546,7 @@ You must return a valid JSON object matching this schema:
9219
9546
  let lastError;
9220
9547
  for (let attempt = 1; attempt <= 3; attempt++) {
9221
9548
  try {
9222
- const { text } = await (0, import_ai3.generateText)({
9549
+ const { text } = await (0, import_ai4.generateText)({
9223
9550
  model,
9224
9551
  system,
9225
9552
  prompt
@@ -9282,31 +9609,39 @@ function createAgentKernel() {
9282
9609
  ToolTrajectoryEvaluator,
9283
9610
  avgToolDurationMs,
9284
9611
  buildDirectoryChain,
9612
+ buildOutputSchema,
9285
9613
  buildPromptInputs,
9286
9614
  buildSearchRoots,
9615
+ clampScore,
9287
9616
  computeTraceSummary,
9288
9617
  consumeClaudeCodeLogEntries,
9289
9618
  consumeCodexLogEntries,
9290
9619
  consumePiLogEntries,
9291
9620
  createAgentKernel,
9292
9621
  createProvider,
9622
+ deepEqual,
9293
9623
  ensureVSCodeSubagents,
9624
+ executeScript,
9294
9625
  explorationRatio,
9295
- extractCodeBlocks,
9626
+ extractJsonBlob,
9296
9627
  fileExists,
9297
9628
  findGitRoot,
9629
+ freeformEvaluationSchema,
9298
9630
  generateRubrics,
9299
9631
  getHitCount,
9300
9632
  isEvaluatorKind,
9301
9633
  isGuidelineFile,
9302
9634
  isJsonObject,
9303
9635
  isJsonValue,
9636
+ isNonEmptyString,
9304
9637
  isTestMessage,
9305
9638
  isTestMessageRole,
9306
9639
  listTargetNames,
9307
9640
  loadEvalCases,
9308
9641
  mergeExecutionMetrics,
9309
9642
  normalizeLineEndings,
9643
+ parseJsonFromText,
9644
+ parseJsonSafe,
9310
9645
  readJsonFile,
9311
9646
  readTargetDefinitions,
9312
9647
  readTestSuiteMetadata,
@@ -9316,6 +9651,7 @@ function createAgentKernel() {
9316
9651
  resolveTargetDefinition,
9317
9652
  runEvalCase,
9318
9653
  runEvaluation,
9654
+ scoreToVerdict,
9319
9655
  subscribeToClaudeCodeLogEntries,
9320
9656
  subscribeToCodexLogEntries,
9321
9657
  subscribeToPiLogEntries,