@agentv/core 2.0.2 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -42,31 +42,39 @@ __export(index_exports, {
42
42
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
43
43
  avgToolDurationMs: () => avgToolDurationMs,
44
44
  buildDirectoryChain: () => buildDirectoryChain2,
45
+ buildOutputSchema: () => buildOutputSchema,
45
46
  buildPromptInputs: () => buildPromptInputs,
46
47
  buildSearchRoots: () => buildSearchRoots2,
48
+ clampScore: () => clampScore,
47
49
  computeTraceSummary: () => computeTraceSummary,
48
50
  consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
49
51
  consumeCodexLogEntries: () => consumeCodexLogEntries,
50
52
  consumePiLogEntries: () => consumePiLogEntries,
51
53
  createAgentKernel: () => createAgentKernel,
52
54
  createProvider: () => createProvider,
55
+ deepEqual: () => deepEqual,
53
56
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
57
+ executeScript: () => executeScript,
54
58
  explorationRatio: () => explorationRatio,
55
- extractCodeBlocks: () => extractCodeBlocks,
59
+ extractJsonBlob: () => extractJsonBlob,
56
60
  fileExists: () => fileExists2,
57
61
  findGitRoot: () => findGitRoot,
62
+ freeformEvaluationSchema: () => freeformEvaluationSchema,
58
63
  generateRubrics: () => generateRubrics,
59
64
  getHitCount: () => getHitCount,
60
65
  isEvaluatorKind: () => isEvaluatorKind,
61
66
  isGuidelineFile: () => isGuidelineFile,
62
67
  isJsonObject: () => isJsonObject,
63
68
  isJsonValue: () => isJsonValue,
69
+ isNonEmptyString: () => isNonEmptyString,
64
70
  isTestMessage: () => isTestMessage,
65
71
  isTestMessageRole: () => isTestMessageRole,
66
72
  listTargetNames: () => listTargetNames,
67
73
  loadEvalCases: () => loadEvalCases,
68
74
  mergeExecutionMetrics: () => mergeExecutionMetrics,
69
75
  normalizeLineEndings: () => normalizeLineEndings,
76
+ parseJsonFromText: () => parseJsonFromText,
77
+ parseJsonSafe: () => parseJsonSafe,
70
78
  readJsonFile: () => readJsonFile,
71
79
  readTargetDefinitions: () => readTargetDefinitions,
72
80
  readTestSuiteMetadata: () => readTestSuiteMetadata,
@@ -76,6 +84,7 @@ __export(index_exports, {
76
84
  resolveTargetDefinition: () => resolveTargetDefinition,
77
85
  runEvalCase: () => runEvalCase,
78
86
  runEvaluation: () => runEvaluation,
87
+ scoreToVerdict: () => scoreToVerdict,
79
88
  subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
80
89
  subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
81
90
  subscribeToPiLogEntries: () => subscribeToPiLogEntries,
@@ -221,85 +230,6 @@ var import_promises6 = require("fs/promises");
221
230
  var import_node_path6 = __toESM(require("path"), 1);
222
231
  var import_yaml2 = require("yaml");
223
232
 
224
- // src/evaluation/formatting/segment-formatter.ts
225
- function extractCodeBlocks(segments) {
226
- const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
227
- const codeBlocks = [];
228
- for (const segment of segments) {
229
- const typeValue = segment.type;
230
- if (typeof typeValue !== "string" || typeValue !== "text") {
231
- continue;
232
- }
233
- const textValue = segment.value;
234
- if (typeof textValue !== "string") {
235
- continue;
236
- }
237
- const matches = textValue.match(CODE_BLOCK_PATTERN);
238
- if (matches) {
239
- codeBlocks.push(...matches);
240
- }
241
- }
242
- return codeBlocks;
243
- }
244
- function formatFileContents(parts) {
245
- const fileCount = parts.filter((p) => p.isFile).length;
246
- if (fileCount > 0) {
247
- return parts.map((part) => {
248
- if (part.isFile && part.displayPath) {
249
- return `<file path="${part.displayPath}">
250
- ${part.content}
251
- </file>`;
252
- }
253
- return part.content;
254
- }).join("\n\n");
255
- }
256
- return parts.map((p) => p.content).join(" ");
257
- }
258
- function formatSegment(segment, mode = "lm") {
259
- const type = asString(segment.type);
260
- if (type === "text") {
261
- return asString(segment.value);
262
- }
263
- if (type === "guideline_ref") {
264
- const refPath = asString(segment.path);
265
- return refPath ? `<Attached: ${refPath}>` : void 0;
266
- }
267
- if (type === "file") {
268
- const filePath = asString(segment.path);
269
- if (!filePath) {
270
- return void 0;
271
- }
272
- if (mode === "agent") {
273
- return `<file: path="${filePath}">`;
274
- }
275
- const text = asString(segment.text);
276
- if (text && filePath) {
277
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
278
- }
279
- }
280
- return void 0;
281
- }
282
- function hasVisibleContent(segments) {
283
- return segments.some((segment) => {
284
- const type = asString(segment.type);
285
- if (type === "text") {
286
- const value = asString(segment.value);
287
- return value !== void 0 && value.trim().length > 0;
288
- }
289
- if (type === "guideline_ref") {
290
- return false;
291
- }
292
- if (type === "file") {
293
- const text = asString(segment.text);
294
- return text !== void 0 && text.trim().length > 0;
295
- }
296
- return false;
297
- });
298
- }
299
- function asString(value) {
300
- return typeof value === "string" ? value : void 0;
301
- }
302
-
303
233
  // src/evaluation/loaders/config-loader.ts
304
234
  var import_promises2 = require("fs/promises");
305
235
  var import_node_path2 = __toESM(require("path"), 1);
@@ -407,7 +337,6 @@ async function resolveFileReference(rawValue, searchRoots) {
407
337
  }
408
338
 
409
339
  // src/evaluation/loaders/config-loader.ts
410
- var SCHEMA_CONFIG_V2 = "agentv-config-v2";
411
340
  var ANSI_YELLOW = "\x1B[33m";
412
341
  var ANSI_RESET = "\x1B[0m";
413
342
  async function loadConfig(evalFilePath, repoRoot) {
@@ -425,13 +354,6 @@ async function loadConfig(evalFilePath, repoRoot) {
425
354
  continue;
426
355
  }
427
356
  const config = parsed;
428
- const schema = config.$schema;
429
- if (schema !== SCHEMA_CONFIG_V2) {
430
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
431
- Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
432
- logWarning(message);
433
- continue;
434
- }
435
357
  const guidelinePatterns = config.guideline_patterns;
436
358
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
437
359
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -540,7 +462,8 @@ var ANSI_YELLOW3 = "\x1B[33m";
540
462
  var ANSI_RESET3 = "\x1B[0m";
541
463
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
542
464
  const execution = rawEvalCase.execution;
543
- const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
465
+ const executionObject = isJsonObject2(execution) ? execution : void 0;
466
+ const candidateEvaluators = (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators ?? globalExecution?.evaluators;
544
467
  if (candidateEvaluators === void 0) {
545
468
  return void 0;
546
469
  }
@@ -554,7 +477,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
554
477
  logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
555
478
  continue;
556
479
  }
557
- const name = asString2(rawEvaluator.name);
480
+ const name = asString(rawEvaluator.name);
558
481
  const typeValue = rawEvaluator.type;
559
482
  if (!name || !isEvaluatorKind(typeValue)) {
560
483
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -582,7 +505,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
582
505
  continue;
583
506
  }
584
507
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
585
- const cwd = asString2(rawEvaluator.cwd);
508
+ const cwd = asString(rawEvaluator.cwd);
586
509
  let resolvedCwd;
587
510
  if (cwd) {
588
511
  const resolved = await resolveFileReference(cwd, searchRoots);
@@ -597,7 +520,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
597
520
  } else {
598
521
  resolvedCwd = searchRoots[0];
599
522
  }
600
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
523
+ const rawTarget = rawEvaluator.target;
524
+ let targetConfig;
525
+ if (rawTarget !== void 0) {
526
+ if (isJsonObject2(rawTarget)) {
527
+ const maxCalls = rawTarget.max_calls;
528
+ if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
529
+ logWarning2(
530
+ `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
531
+ );
532
+ } else {
533
+ targetConfig = {
534
+ ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
535
+ };
536
+ }
537
+ } else if (rawTarget === true) {
538
+ targetConfig = {};
539
+ } else {
540
+ logWarning2(
541
+ `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
542
+ );
543
+ }
544
+ }
545
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
601
546
  const config = {};
602
547
  for (const [key, value] of Object.entries(rawEvaluator)) {
603
548
  if (!knownProps.has(key) && value !== void 0) {
@@ -611,7 +556,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
611
556
  cwd,
612
557
  resolvedCwd,
613
558
  ...weight2 !== void 0 ? { weight: weight2 } : {},
614
- ...Object.keys(config).length > 0 ? { config } : {}
559
+ ...Object.keys(config).length > 0 ? { config } : {},
560
+ ...targetConfig !== void 0 ? { target: targetConfig } : {}
615
561
  });
616
562
  continue;
617
563
  }
@@ -628,7 +574,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
628
574
  logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
629
575
  continue;
630
576
  }
631
- const aggregatorType = asString2(rawAggregator.type);
577
+ const aggregatorType = asString(rawAggregator.type);
632
578
  if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
633
579
  logWarning2(
634
580
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -641,7 +587,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
641
587
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
642
588
  continue;
643
589
  }
644
- const memberName = asString2(rawMember.name);
590
+ const memberName = asString(rawMember.name);
645
591
  const memberType = rawMember.type;
646
592
  if (!memberName || !isEvaluatorKind(memberType)) {
647
593
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -679,7 +625,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
679
625
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
680
626
  };
681
627
  } else if (aggregatorType === "code_judge") {
682
- const aggregatorPath = asString2(rawAggregator.path);
628
+ const aggregatorPath = asString(rawAggregator.path);
683
629
  if (!aggregatorPath) {
684
630
  logWarning2(
685
631
  `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -692,7 +638,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
692
638
  cwd: searchRoots[0]
693
639
  };
694
640
  } else {
695
- const aggregatorPrompt = asString2(rawAggregator.prompt);
641
+ const aggregatorPrompt = asString(rawAggregator.prompt);
696
642
  let promptPath2;
697
643
  if (aggregatorPrompt) {
698
644
  const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
@@ -717,7 +663,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
717
663
  continue;
718
664
  }
719
665
  if (typeValue === "tool_trajectory") {
720
- const mode = asString2(rawEvaluator.mode);
666
+ const mode = asString(rawEvaluator.mode);
721
667
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
722
668
  logWarning2(
723
669
  `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -808,8 +754,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
808
754
  );
809
755
  continue;
810
756
  }
811
- const fieldPath = asString2(rawField.path);
812
- const match = asString2(rawField.match);
757
+ const fieldPath = asString(rawField.path);
758
+ const match = asString(rawField.match);
813
759
  if (!fieldPath) {
814
760
  logWarning2(
815
761
  `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -839,7 +785,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
839
785
  );
840
786
  continue;
841
787
  }
842
- const aggregation = asString2(rawEvaluator.aggregation);
788
+ const aggregation = asString(rawEvaluator.aggregation);
843
789
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
844
790
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
845
791
  evaluators.push({
@@ -920,7 +866,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
920
866
  });
921
867
  continue;
922
868
  }
923
- const prompt = asString2(rawEvaluator.prompt);
869
+ const prompt = asString(rawEvaluator.prompt);
924
870
  let promptPath;
925
871
  if (prompt) {
926
872
  const resolved = await resolveFileReference(prompt, searchRoots);
@@ -939,11 +885,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
939
885
  );
940
886
  }
941
887
  }
942
- const _model = asString2(rawEvaluator.model);
888
+ const _model = asString(rawEvaluator.model);
943
889
  const rawRubrics = rawEvaluator.rubrics;
944
890
  const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
945
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
946
- description: asString2(rubric.description) ?? "",
891
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
892
+ description: asString(rubric.description) ?? "",
947
893
  weight: typeof rubric.weight === "number" ? rubric.weight : 1,
948
894
  required: typeof rubric.required === "boolean" ? rubric.required : true
949
895
  })).filter((r) => r.description.length > 0) : void 0;
@@ -987,7 +933,7 @@ function coerceEvaluator(candidate, contextId) {
987
933
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
988
934
  return void 0;
989
935
  }
990
- function asString2(value) {
936
+ function asString(value) {
991
937
  return typeof value === "string" ? value : void 0;
992
938
  }
993
939
  function asStringArray(value, description) {
@@ -1063,6 +1009,68 @@ function isValidFieldAggregationType(value) {
1063
1009
  // src/evaluation/loaders/message-processor.ts
1064
1010
  var import_promises4 = require("fs/promises");
1065
1011
  var import_node_path4 = __toESM(require("path"), 1);
1012
+
1013
+ // src/evaluation/formatting/segment-formatter.ts
1014
+ function formatFileContents(parts) {
1015
+ const fileCount = parts.filter((p) => p.isFile).length;
1016
+ if (fileCount > 0) {
1017
+ return parts.map((part) => {
1018
+ if (part.isFile && part.displayPath) {
1019
+ return `<file path="${part.displayPath}">
1020
+ ${part.content}
1021
+ </file>`;
1022
+ }
1023
+ return part.content;
1024
+ }).join("\n\n");
1025
+ }
1026
+ return parts.map((p) => p.content).join(" ");
1027
+ }
1028
+ function formatSegment(segment, mode = "lm") {
1029
+ const type = asString2(segment.type);
1030
+ if (type === "text") {
1031
+ return asString2(segment.value);
1032
+ }
1033
+ if (type === "guideline_ref") {
1034
+ const refPath = asString2(segment.path);
1035
+ return refPath ? `<Attached: ${refPath}>` : void 0;
1036
+ }
1037
+ if (type === "file") {
1038
+ const filePath = asString2(segment.path);
1039
+ if (!filePath) {
1040
+ return void 0;
1041
+ }
1042
+ if (mode === "agent") {
1043
+ return `<file: path="${filePath}">`;
1044
+ }
1045
+ const text = asString2(segment.text);
1046
+ if (text && filePath) {
1047
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
1048
+ }
1049
+ }
1050
+ return void 0;
1051
+ }
1052
+ function hasVisibleContent(segments) {
1053
+ return segments.some((segment) => {
1054
+ const type = asString2(segment.type);
1055
+ if (type === "text") {
1056
+ const value = asString2(segment.value);
1057
+ return value !== void 0 && value.trim().length > 0;
1058
+ }
1059
+ if (type === "guideline_ref") {
1060
+ return false;
1061
+ }
1062
+ if (type === "file") {
1063
+ const text = asString2(segment.text);
1064
+ return text !== void 0 && text.trim().length > 0;
1065
+ }
1066
+ return false;
1067
+ });
1068
+ }
1069
+ function asString2(value) {
1070
+ return typeof value === "string" ? value : void 0;
1071
+ }
1072
+
1073
+ // src/evaluation/loaders/message-processor.ts
1066
1074
  var ANSI_YELLOW4 = "\x1B[33m";
1067
1075
  var ANSI_RESET4 = "\x1B[0m";
1068
1076
  async function processMessages(options) {
@@ -1368,9 +1376,6 @@ ${messageContent}`);
1368
1376
  questionParts.push(formattedContent);
1369
1377
  }
1370
1378
  }
1371
- if (testCase.code_snippets.length > 0) {
1372
- questionParts.push(testCase.code_snippets.join("\n"));
1373
- }
1374
1379
  question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
1375
1380
  }
1376
1381
  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1569,7 +1574,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1569
1574
  repoRootPath,
1570
1575
  verbose
1571
1576
  }) : [];
1572
- const codeSnippets = extractCodeBlocks(inputSegments);
1573
1577
  let referenceAnswer = "";
1574
1578
  if (outputSegments.length > 0) {
1575
1579
  const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1642,7 +1646,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1642
1646
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
1643
1647
  guideline_patterns: guidelinePatterns,
1644
1648
  file_paths: allFilePaths,
1645
- code_snippets: codeSnippets,
1646
1649
  expected_outcome: outcome,
1647
1650
  evaluator: evalCaseEvaluatorKind,
1648
1651
  evaluators
@@ -6327,9 +6330,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
6327
6330
  return createProvider(resolved);
6328
6331
  }
6329
6332
 
6330
- // src/evaluation/evaluators.ts
6331
- var import_ai2 = require("ai");
6332
- var import_zod3 = require("zod");
6333
+ // src/evaluation/evaluators/scoring.ts
6334
+ function scoreToVerdict(score) {
6335
+ if (score >= 0.8) {
6336
+ return "pass";
6337
+ }
6338
+ if (score >= 0.6) {
6339
+ return "borderline";
6340
+ }
6341
+ return "fail";
6342
+ }
6343
+ function clampScore(value) {
6344
+ if (Number.isNaN(value) || !Number.isFinite(value)) {
6345
+ return 0;
6346
+ }
6347
+ if (value < 0) {
6348
+ return 0;
6349
+ }
6350
+ if (value > 1) {
6351
+ return 1;
6352
+ }
6353
+ return value;
6354
+ }
6355
+ function extractJsonBlob(text) {
6356
+ const match = text.match(/\{[\s\S]*\}/);
6357
+ return match?.[0];
6358
+ }
6359
+ function parseJsonFromText(text) {
6360
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6361
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
6362
+ return JSON.parse(blob);
6363
+ }
6364
+ function isNonEmptyString(value) {
6365
+ return typeof value === "string" && value.trim().length > 0;
6366
+ }
6367
+ function parseJsonSafe(payload) {
6368
+ try {
6369
+ return JSON.parse(payload);
6370
+ } catch {
6371
+ return void 0;
6372
+ }
6373
+ }
6374
+ function deepEqual(a, b) {
6375
+ if (a === b) return true;
6376
+ if (a === null || b === null) return a === b;
6377
+ if (typeof a !== typeof b) return false;
6378
+ if (typeof a !== "object") return a === b;
6379
+ if (Array.isArray(a) !== Array.isArray(b)) return false;
6380
+ if (Array.isArray(a) && Array.isArray(b)) {
6381
+ if (a.length !== b.length) return false;
6382
+ return a.every((val, i) => deepEqual(val, b[i]));
6383
+ }
6384
+ const aObj = a;
6385
+ const bObj = b;
6386
+ const aKeys = Object.keys(aObj);
6387
+ const bKeys = Object.keys(bObj);
6388
+ if (aKeys.length !== bKeys.length) return false;
6389
+ return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
6390
+ }
6333
6391
 
6334
6392
  // src/runtime/exec.ts
6335
6393
  function shellEscapePath(value) {
@@ -6354,7 +6412,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
6354
6412
  cwd: options.cwd,
6355
6413
  stdin: encoder.encode(stdinPayload),
6356
6414
  stdout: "pipe",
6357
- stderr: "pipe"
6415
+ stderr: "pipe",
6416
+ // Merge additional env vars with process.env
6417
+ env: options.env ? { ...process.env, ...options.env } : process.env
6358
6418
  });
6359
6419
  let timedOut = false;
6360
6420
  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -6389,7 +6449,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
6389
6449
  const [cmd, ...args] = argv;
6390
6450
  const child = spawn4(cmd, args, {
6391
6451
  cwd: options.cwd,
6392
- stdio: ["pipe", "pipe", "pipe"]
6452
+ stdio: ["pipe", "pipe", "pipe"],
6453
+ // Merge additional env vars with process.env
6454
+ env: options.env ? { ...process.env, ...options.env } : process.env
6393
6455
  });
6394
6456
  const stdoutChunks = [];
6395
6457
  const stderrChunks = [];
@@ -6442,7 +6504,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
6442
6504
  const child = spawn4(wrappedCommand, {
6443
6505
  shell: true,
6444
6506
  cwd: options.cwd,
6445
- stdio: ["ignore", "ignore", "ignore"]
6507
+ stdio: ["ignore", "ignore", "ignore"],
6508
+ // Merge additional env vars with process.env
6509
+ env: options.env ? { ...process.env, ...options.env } : process.env
6446
6510
  });
6447
6511
  const timeout = options.timeoutMs ? setTimeout(() => {
6448
6512
  child.kill();
@@ -6469,59 +6533,414 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
6469
6533
  }
6470
6534
  }
6471
6535
 
6472
- // src/evaluation/case-conversion.ts
6473
- function toSnakeCase(str) {
6474
- if (/^[A-Z]/.test(str)) {
6475
- return str;
6476
- }
6477
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
6478
- }
6479
- function toSnakeCaseDeep(obj) {
6480
- if (obj === null || obj === void 0) {
6481
- return obj;
6482
- }
6483
- if (Array.isArray(obj)) {
6484
- return obj.map((item) => toSnakeCaseDeep(item));
6485
- }
6486
- if (typeof obj === "object") {
6487
- const result = {};
6488
- for (const [key, value] of Object.entries(obj)) {
6489
- const snakeKey = toSnakeCase(key);
6490
- result[snakeKey] = toSnakeCaseDeep(value);
6536
+ // src/runtime/target-proxy.ts
6537
+ var import_node_crypto4 = require("crypto");
6538
+ var import_node_http = require("http");
6539
+ var DEFAULT_MAX_CALLS = 50;
6540
+ async function createTargetProxy(options) {
6541
+ const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
6542
+ const token = (0, import_node_crypto4.randomBytes)(32).toString("hex");
6543
+ let callCount = 0;
6544
+ let isShutdown = false;
6545
+ const targetsList = availableTargets ?? [defaultProvider.targetName];
6546
+ function resolveProvider(targetName) {
6547
+ if (targetName === void 0 || targetName === defaultProvider.targetName) {
6548
+ return defaultProvider;
6549
+ }
6550
+ if (targetResolver) {
6551
+ return targetResolver(targetName);
6491
6552
  }
6492
- return result;
6553
+ return void 0;
6493
6554
  }
6494
- return obj;
6495
- }
6496
-
6497
- // src/evaluation/providers/types.ts
6498
- var AGENT_PROVIDER_KINDS = [
6499
- "codex",
6500
- "pi-coding-agent",
6501
- "claude-code",
6502
- "vscode",
6503
- "vscode-insiders"
6504
- ];
6505
- function extractLastAssistantContent(messages) {
6506
- if (!messages || messages.length === 0) {
6507
- return "";
6555
+ const server = (0, import_node_http.createServer)(async (req, res) => {
6556
+ res.setHeader("Access-Control-Allow-Origin", "*");
6557
+ res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
6558
+ res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
6559
+ if (req.method === "OPTIONS") {
6560
+ res.writeHead(204);
6561
+ res.end();
6562
+ return;
6563
+ }
6564
+ const authHeader = req.headers.authorization;
6565
+ if (!authHeader || authHeader !== `Bearer ${token}`) {
6566
+ sendJson(res, 401, { error: "Unauthorized" });
6567
+ return;
6568
+ }
6569
+ if (isShutdown) {
6570
+ sendJson(res, 503, { error: "Proxy is shutting down" });
6571
+ return;
6572
+ }
6573
+ const url2 = req.url ?? "";
6574
+ if (req.method === "GET" && url2 === "/info") {
6575
+ handleInfo(res);
6576
+ return;
6577
+ }
6578
+ if (req.method === "POST" && url2 === "/invoke") {
6579
+ await handleInvoke(req, res);
6580
+ return;
6581
+ }
6582
+ if (req.method === "POST" && url2 === "/invokeBatch") {
6583
+ await handleInvokeBatch(req, res);
6584
+ return;
6585
+ }
6586
+ sendJson(res, 404, { error: "Not found" });
6587
+ });
6588
+ function handleInfo(res) {
6589
+ const response = {
6590
+ targetName: defaultProvider.targetName,
6591
+ maxCalls,
6592
+ callCount,
6593
+ availableTargets: targetsList
6594
+ };
6595
+ sendJson(res, 200, response);
6508
6596
  }
6509
- for (let i = messages.length - 1; i >= 0; i--) {
6510
- const msg = messages[i];
6511
- if (msg.role === "assistant" && msg.content !== void 0) {
6512
- if (typeof msg.content === "string") {
6513
- return msg.content;
6597
+ async function handleInvoke(req, res) {
6598
+ if (callCount >= maxCalls) {
6599
+ sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
6600
+ return;
6601
+ }
6602
+ try {
6603
+ const body = await readBody(req);
6604
+ const request = JSON.parse(body);
6605
+ if (!request.question || typeof request.question !== "string") {
6606
+ sendJson(res, 400, { error: "Missing required field: question" });
6607
+ return;
6514
6608
  }
6515
- return JSON.stringify(msg.content);
6609
+ const provider = resolveProvider(request.target);
6610
+ if (!provider) {
6611
+ sendJson(res, 400, {
6612
+ error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
6613
+ });
6614
+ return;
6615
+ }
6616
+ callCount++;
6617
+ const response = await provider.invoke({
6618
+ question: request.question,
6619
+ systemPrompt: request.systemPrompt,
6620
+ evalCaseId: request.evalCaseId ?? "proxy",
6621
+ attempt: request.attempt ?? 1
6622
+ });
6623
+ const outputMessages = response.outputMessages ?? [];
6624
+ const rawText = extractLastAssistantContent(outputMessages);
6625
+ const result = {
6626
+ outputMessages,
6627
+ rawText
6628
+ };
6629
+ sendJson(res, 200, result);
6630
+ } catch (error) {
6631
+ const message = error instanceof Error ? error.message : String(error);
6632
+ sendJson(res, 500, { error: message });
6516
6633
  }
6517
6634
  }
6518
- return "";
6519
- }
6635
+ async function handleInvokeBatch(req, res) {
6636
+ try {
6637
+ const body = await readBody(req);
6638
+ const { requests } = JSON.parse(body);
6639
+ if (!Array.isArray(requests)) {
6640
+ sendJson(res, 400, { error: "Missing required field: requests (array)" });
6641
+ return;
6642
+ }
6643
+ if (callCount + requests.length > maxCalls) {
6644
+ sendJson(res, 429, {
6645
+ error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
6646
+ });
6647
+ return;
6648
+ }
6649
+ const responses = [];
6650
+ for (const request of requests) {
6651
+ if (!request.question || typeof request.question !== "string") {
6652
+ responses.push({
6653
+ outputMessages: [],
6654
+ rawText: "Error: Missing required field: question"
6655
+ });
6656
+ continue;
6657
+ }
6658
+ const provider = resolveProvider(request.target);
6659
+ if (!provider) {
6660
+ responses.push({
6661
+ outputMessages: [],
6662
+ rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
6663
+ });
6664
+ continue;
6665
+ }
6666
+ callCount++;
6667
+ try {
6668
+ const response = await provider.invoke({
6669
+ question: request.question,
6670
+ systemPrompt: request.systemPrompt,
6671
+ evalCaseId: request.evalCaseId ?? "proxy",
6672
+ attempt: request.attempt ?? 1
6673
+ });
6674
+ const outputMessages = response.outputMessages ?? [];
6675
+ responses.push({
6676
+ outputMessages,
6677
+ rawText: extractLastAssistantContent(outputMessages)
6678
+ });
6679
+ } catch (error) {
6680
+ const message = error instanceof Error ? error.message : String(error);
6681
+ responses.push({
6682
+ outputMessages: [],
6683
+ rawText: `Error: ${message}`
6684
+ });
6685
+ }
6686
+ }
6687
+ sendJson(res, 200, { responses });
6688
+ } catch (error) {
6689
+ const message = error instanceof Error ? error.message : String(error);
6690
+ sendJson(res, 500, { error: message });
6691
+ }
6692
+ }
6693
+ await new Promise((resolve, reject) => {
6694
+ server.once("error", reject);
6695
+ server.listen(0, "127.0.0.1", () => {
6696
+ server.removeListener("error", reject);
6697
+ resolve();
6698
+ });
6699
+ });
6700
+ const address = server.address();
6701
+ const url = `http://127.0.0.1:${address.port}`;
6702
+ return {
6703
+ url,
6704
+ token,
6705
+ shutdown: async () => {
6706
+ isShutdown = true;
6707
+ return new Promise((resolve, reject) => {
6708
+ server.close((err) => {
6709
+ if (err) reject(err);
6710
+ else resolve();
6711
+ });
6712
+ });
6713
+ },
6714
+ getUsageMetadata: () => ({
6715
+ callCount,
6716
+ maxCalls
6717
+ })
6718
+ };
6719
+ }
6720
+ function sendJson(res, statusCode, body) {
6721
+ res.writeHead(statusCode, { "Content-Type": "application/json" });
6722
+ res.end(JSON.stringify(body));
6723
+ }
6724
+ function readBody(req) {
6725
+ return new Promise((resolve, reject) => {
6726
+ const chunks = [];
6727
+ req.on("data", (chunk) => chunks.push(chunk));
6728
+ req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
6729
+ req.on("error", reject);
6730
+ });
6731
+ }
6732
+ function extractLastAssistantContent(messages) {
6733
+ for (let i = messages.length - 1; i >= 0; i--) {
6734
+ const msg = messages[i];
6735
+ if (msg.role === "assistant" && msg.content !== void 0) {
6736
+ if (typeof msg.content === "string") {
6737
+ return msg.content;
6738
+ }
6739
+ if (Array.isArray(msg.content)) {
6740
+ for (const part of msg.content) {
6741
+ if (typeof part === "object" && part !== null && "text" in part) {
6742
+ return String(part.text);
6743
+ }
6744
+ }
6745
+ }
6746
+ }
6747
+ }
6748
+ return void 0;
6749
+ }
6750
+
6751
+ // src/evaluation/case-conversion.ts
6752
/**
 * Converts a camelCase key to snake_case by prefixing every ASCII uppercase
 * letter with `_` and lower-casing it.
 *
 * Strings that start with an uppercase ASCII letter are returned unchanged.
 *
 * @param {string} str - Key to convert.
 * @returns {string} The converted key.
 */
function toSnakeCase(str) {
  if (/^[A-Z]/.test(str)) {
    return str;
  }
  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
}

/**
 * Recursively rewrites the keys of `obj` with `toSnakeCase`.
 *
 * `null`, `undefined` and non-object values are returned as-is. Arrays are
 * mapped element by element. Every other object is rebuilt from its own
 * enumerable string-keyed entries.
 *
 * The result is assembled with `Object.fromEntries`, which defines each entry
 * as an own data property. Plain assignment (`result[key] = value`) would route
 * an own key named `__proto__` through the prototype setter and silently drop
 * that entry from the converted object.
 *
 * @param {*} obj - Value to convert.
 * @returns {*} A converted copy for arrays/objects; the input itself otherwise.
 */
function toSnakeCaseDeep(obj) {
  if (obj === null || obj === void 0) {
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => toSnakeCaseDeep(item));
  }
  if (typeof obj !== "object") {
    return obj;
  }
  return Object.fromEntries(
    Object.entries(obj).map(([key, value]) => [toSnakeCase(key), toSnakeCaseDeep(value)])
  );
}
6775
+
6776
+ // src/evaluation/evaluators/code-evaluator.ts
6777
/**
 * Evaluator of kind "code": serializes the eval case and candidate answer to a
 * snake_case JSON payload, hands it to an external script through
 * `executeScript`, and turns the script's JSON output into an evaluation result.
 *
 * When a `target` option was supplied and the context carries a
 * `judgeProvider`, a target proxy is created for the duration of the run and
 * its URL/token are passed to the script via `AGENTV_TARGET_PROXY_URL` and
 * `AGENTV_TARGET_PROXY_TOKEN`.
 */
var CodeEvaluator = class {
  kind = "code";
  script;
  cwd;
  agentTimeoutMs;
  config;
  target;
  /**
   * @param {object} options - Reads `script`, `cwd`, `agentTimeoutMs`, `config`, `target`.
   */
  constructor(options) {
    const { script, cwd, agentTimeoutMs, config, target } = options;
    this.script = script;
    this.cwd = cwd;
    this.agentTimeoutMs = agentTimeoutMs;
    this.config = config;
    this.target = target;
  }
  /**
   * Runs the script and maps its output to a result object.
   *
   * A failure of the script run is not rethrown: it is reported as a score-0
   * "fail" result whose `misses` and `reasoning` carry the error message.
   *
   * @param {object} context - Evaluation context (`evalCase`, `candidate`, and
   *   optionally `outputMessages`, `traceSummary`, `judgeProvider`,
   *   `targetResolver`, `availableTargets`).
   * @returns {Promise<object>} The evaluation result.
   */
  async evaluate(context) {
    const { evalCase } = context;
    const payload = {
      question: evalCase.question,
      expectedOutcome: evalCase.expected_outcome,
      expectedMessages: evalCase.expected_messages,
      referenceAnswer: evalCase.reference_answer,
      candidateAnswer: context.candidate,
      outputMessages: context.outputMessages ?? null,
      guidelineFiles: evalCase.guideline_paths,
      // Input files are the case's files that are not guideline files.
      inputFiles: evalCase.file_paths.filter(
        (filePath) => !context.evalCase.guideline_paths.includes(filePath)
      ),
      inputMessages: evalCase.input_messages,
      traceSummary: context.traceSummary ?? null,
      config: this.config ?? null
    };
    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);

    let proxyEnv;
    let proxyShutdown;
    let getProxyUsage;
    const wantsProxy = this.target !== void 0 && context.judgeProvider;
    if (wantsProxy) {
      const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
      const { url, token, shutdown, getUsageMetadata } = await createTargetProxy({
        defaultProvider: context.judgeProvider,
        targetResolver: context.targetResolver,
        availableTargets: context.availableTargets,
        maxCalls
      });
      proxyEnv = {
        AGENTV_TARGET_PROXY_URL: url,
        AGENTV_TARGET_PROXY_TOKEN: token
      };
      proxyShutdown = shutdown;
      getProxyUsage = getUsageMetadata;
    }

    // Shared by the success and failure paths: script, optional cwd, optional
    // proxy usage counters, and (failure only) the error message.
    const describeRequest = (errorMessage) => {
      const usage = getProxyUsage?.();
      const request = { script: this.script };
      if (this.cwd) {
        request.cwd = this.cwd;
      }
      if (usage) {
        request.target_proxy = {
          call_count: usage.callCount,
          max_calls: usage.maxCalls
        };
      }
      if (errorMessage !== void 0) {
        request.error = errorMessage;
      }
      return request;
    };
    const nonEmptyStrings = (value) => Array.isArray(value) ? value.filter(isNonEmptyString) : [];

    try {
      const stdout = await executeScript(
        this.script,
        inputPayload,
        this.agentTimeoutMs,
        this.cwd,
        proxyEnv
      );
      const parsed = parseJsonSafe(stdout);
      const rawScore = typeof parsed?.score === "number" ? parsed.score : 0;
      const score = clampScore(rawScore);
      const hits = nonEmptyStrings(parsed?.hits);
      const misses = nonEmptyStrings(parsed?.misses);
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      const rawDetails = parsed?.details;
      const hasDetails = rawDetails && typeof rawDetails === "object" && !Array.isArray(rawDetails);
      const outcome = {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: describeRequest()
      };
      // `details` is only attached when the script returned a non-array object.
      if (hasDetails) {
        outcome.details = rawDetails;
      }
      return outcome;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: describeRequest(message)
      };
    } finally {
      // The proxy, when one was started, is shut down on both paths.
      if (proxyShutdown) {
        await proxyShutdown();
      }
    }
  }
};
6890
/**
 * Runs an evaluator script with `input` as its input and returns its trimmed
 * standard output.
 *
 * A string `scriptPath` is run through `execShellWithStdin`; any other value
 * is run through `execFileWithStdin`.
 *
 * @param {string | *} scriptPath - Script to run (string, or a value accepted by `execFileWithStdin`).
 * @param {string} input - Data forwarded to the script.
 * @param {number} [agentTimeoutMs] - Forwarded as the `timeoutMs` option.
 * @param {string} [cwd] - Forwarded as the `cwd` option.
 * @param {object} [env] - Forwarded as the `env` option.
 * @returns {Promise<string>} Trimmed stdout of the script.
 * @throws {Error} When the exit code is not 0; the message includes the
 *   formatted stderr if it is non-empty.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
  const options = { cwd, timeoutMs: agentTimeoutMs, env };
  const run = typeof scriptPath === "string" ? execShellWithStdin : execFileWithStdin;
  const { stdout, stderr, exitCode } = await run(scriptPath, input, options);
  if (exitCode === 0) {
    return stdout.trim();
  }
  const trimmedErr = formatStderr(stderr);
  const suffix = trimmedErr.length > 0 ? `: ${trimmedErr}` : "";
  throw new Error(`Code evaluator exited with code ${exitCode}${suffix}`);
}
6900
/**
 * Trims `stderr` and, when the result is longer than 2000 characters, keeps
 * only its last 2000 characters behind a one-line truncation notice.
 *
 * @param {string} stderr - Raw standard-error text.
 * @returns {string} The trimmed (and possibly truncated) text.
 */
function formatStderr(stderr) {
  const maxChars = 2e3;
  const text = stderr.trim();
  const overflow = text.length - maxChars;
  if (overflow <= 0) {
    return text;
  }
  // Keep the tail of the output and drop the first `overflow` characters.
  return `...(truncated, last ${maxChars} chars)\n${text.slice(overflow)}`;
}
6910
+
6911
+ // src/evaluation/evaluators/composite.ts
6912
+ var import_ai3 = require("ai");
6913
+
6914
+ // src/evaluation/providers/types.ts
6915
// Provider `kind` values that `isAgentProvider` accepts.
var AGENT_PROVIDER_KINDS = ["codex", "pi-coding-agent", "claude-code", "vscode", "vscode-insiders"];
6922
/**
 * Returns the content of the last assistant message that has defined content.
 *
 * String content is returned unchanged; any other content is returned as its
 * `JSON.stringify` serialization.
 *
 * @param {Array<object> | null | undefined} messages - Messages carrying `role` and `content`.
 * @returns {string} The content, or `""` when `messages` is missing, empty, or
 *   holds no assistant message with content.
 */
function extractLastAssistantContent2(messages) {
  if (!messages) {
    return "";
  }
  let cursor = messages.length;
  while (cursor-- > 0) {
    const entry = messages[cursor];
    const isAnswer = entry.role === "assistant" && entry.content !== void 0;
    if (!isAnswer) {
      continue;
    }
    const content = entry.content;
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
6520
6937
/**
 * Tells whether `provider` is one of the agent provider kinds.
 *
 * @param {{ kind: string } | null | undefined} provider - Provider to inspect.
 * @returns {boolean} `true` when `provider.kind` is listed in
 *   `AGENT_PROVIDER_KINDS`; `false` for a missing provider or any other kind.
 */
function isAgentProvider(provider) {
  if (!provider) {
    return false;
  }
  return AGENT_PROVIDER_KINDS.includes(provider.kind);
}
6523
6940
 
6524
- // src/evaluation/evaluators.ts
6941
+ // src/evaluation/evaluators/llm-judge.ts
6942
+ var import_ai2 = require("ai");
6943
+ var import_zod3 = require("zod");
6525
6944
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
6526
6945
 
6527
6946
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -6601,7 +7020,7 @@ var LlmJudgeEvaluator = class {
6601
7020
  target: judgeProvider.targetName
6602
7021
  };
6603
7022
  try {
6604
- const { data, providerResponse } = await this.runWithRetry({
7023
+ const { data } = await this.runWithRetry({
6605
7024
  context,
6606
7025
  judgeProvider,
6607
7026
  systemPrompt,
@@ -6714,7 +7133,7 @@ var LlmJudgeEvaluator = class {
6714
7133
  temperature: this.temperature
6715
7134
  });
6716
7135
  const data = schema.parse(
6717
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
7136
+ parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
6718
7137
  );
6719
7138
  return { data, providerResponse: response };
6720
7139
  } catch (e) {
@@ -6750,86 +7169,160 @@ You must return a valid JSON object matching this schema:
6750
7169
  "overall_reasoning": "string (summary)"
6751
7170
  }`;
6752
7171
  }
6753
- function scoreToVerdict(score) {
6754
- if (score >= 0.8) {
6755
- return "pass";
6756
- }
6757
- if (score >= 0.6) {
6758
- return "borderline";
6759
- }
6760
- return "fail";
7172
/**
 * Replaces `{{ name }}` placeholders in `template` with values from `variables`.
 *
 * Placeholder names may contain ASCII letters, digits and underscores, with
 * optional whitespace inside the braces. A placeholder is left untouched when
 * `variables` has no own property of that name, or when the value is `null`
 * or `undefined`.
 *
 * Only own properties are consulted. A plain `variables[name]` lookup would
 * also resolve inherited members, so `{{constructor}}` or `{{toString}}`
 * would be replaced by the source text of an `Object.prototype` function.
 *
 * @param {string} template - Text containing placeholders.
 * @param {Record<string, *>} variables - Values keyed by placeholder name.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
    if (!Object.hasOwn(variables, varName)) {
      return match;
    }
    return variables[varName] ?? match;
  });
}
6762
- function clampScore(value) {
6763
- if (Number.isNaN(value) || !Number.isFinite(value)) {
6764
- return 0;
6765
- }
6766
- if (value < 0) {
6767
- return 0;
6768
- }
6769
- if (value > 1) {
6770
- return 1;
7177
+ function calculateRubricScore(result, rubrics) {
7178
+ const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
7179
+ const hits = [];
7180
+ const misses = [];
7181
+ let totalWeight = 0;
7182
+ let earnedWeight = 0;
7183
+ let failedRequired = false;
7184
+ for (const check of result.checks) {
7185
+ const rubric = rubricMap.get(check.id);
7186
+ if (!rubric) {
7187
+ continue;
7188
+ }
7189
+ totalWeight += rubric.weight;
7190
+ if (check.satisfied) {
7191
+ earnedWeight += rubric.weight;
7192
+ hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
7193
+ } else {
7194
+ misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
7195
+ if (rubric.required) {
7196
+ failedRequired = true;
7197
+ }
7198
+ }
6771
7199
  }
6772
- return value;
6773
- }
6774
- function extractJsonBlob(text) {
6775
- const match = text.match(/\{[\s\S]*\}/);
6776
- return match?.[0];
6777
- }
6778
- function parseJsonFromText(text) {
6779
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6780
- const blob = extractJsonBlob(cleaned) ?? cleaned;
6781
- return JSON.parse(blob);
6782
- }
6783
- function isNonEmptyString(value) {
6784
- return typeof value === "string" && value.trim().length > 0;
7200
+ const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
7201
+ const verdict = failedRequired ? "fail" : scoreToVerdict(score);
7202
+ return { score, verdict, hits, misses };
6785
7203
  }
6786
- var CodeEvaluator = class {
6787
- kind = "code";
6788
- script;
6789
- cwd;
6790
- agentTimeoutMs;
7204
+
7205
+ // src/evaluation/evaluators/composite.ts
7206
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
7207
+ {{EVALUATOR_RESULTS_JSON}}
7208
+
7209
+ Decide the final score and verdict based on all evaluator results.
7210
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
7211
+ var CompositeEvaluator = class {
7212
+ kind = "composite";
6791
7213
  config;
7214
+ evaluatorFactory;
7215
+ cwd;
6792
7216
  constructor(options) {
6793
- this.script = options.script;
6794
- this.cwd = options.cwd;
6795
- this.agentTimeoutMs = options.agentTimeoutMs;
6796
7217
  this.config = options.config;
7218
+ this.evaluatorFactory = options.evaluatorFactory;
7219
+ this.cwd = options.cwd;
6797
7220
  }
6798
7221
  async evaluate(context) {
6799
- const payload = {
6800
- question: context.evalCase.question,
6801
- expectedOutcome: context.evalCase.expected_outcome,
6802
- expectedMessages: context.evalCase.expected_messages,
6803
- referenceAnswer: context.evalCase.reference_answer,
6804
- candidateAnswer: context.candidate,
6805
- outputMessages: context.outputMessages ?? null,
6806
- guidelineFiles: context.evalCase.guideline_paths,
6807
- inputFiles: context.evalCase.file_paths.filter(
6808
- (path17) => !context.evalCase.guideline_paths.includes(path17)
6809
- ),
6810
- inputMessages: context.evalCase.input_messages,
6811
- traceSummary: context.traceSummary ?? null,
6812
- config: this.config ?? null
7222
+ const memberResults = await Promise.all(
7223
+ this.config.evaluators.map(async (memberConfig) => {
7224
+ const evaluator = this.evaluatorFactory.create(memberConfig, context);
7225
+ return {
7226
+ id: memberConfig.name,
7227
+ type: memberConfig.type,
7228
+ result: await evaluator.evaluate(context)
7229
+ };
7230
+ })
7231
+ );
7232
+ return this.aggregate(memberResults, context);
7233
+ }
7234
+ async aggregate(results, context) {
7235
+ const aggregator = this.config.aggregator;
7236
+ switch (aggregator.type) {
7237
+ case "code_judge":
7238
+ return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
7239
+ case "llm_judge":
7240
+ return this.runLlmAggregator(results, context, aggregator);
7241
+ default:
7242
+ return this.runWeightedAverage(results, aggregator.weights);
7243
+ }
7244
+ }
7245
+ runWeightedAverage(results, weights) {
7246
+ let totalWeight = 0;
7247
+ let weightedSum = 0;
7248
+ const allHits = [];
7249
+ const allMisses = [];
7250
+ const reasoningParts = [];
7251
+ const evaluatorResults = [];
7252
+ for (const member of results) {
7253
+ const weight = weights?.[member.id] ?? 1;
7254
+ totalWeight += weight;
7255
+ weightedSum += member.result.score * weight;
7256
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
7257
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
7258
+ if (member.result.reasoning) {
7259
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
7260
+ }
7261
+ evaluatorResults.push({
7262
+ name: member.id,
7263
+ type: member.type,
7264
+ score: member.result.score,
7265
+ weight,
7266
+ verdict: member.result.verdict,
7267
+ hits: [...member.result.hits],
7268
+ misses: [...member.result.misses],
7269
+ reasoning: member.result.reasoning,
7270
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7271
+ evaluatorResults: member.result.evaluatorResults,
7272
+ details: member.result.details
7273
+ });
7274
+ }
7275
+ const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
7276
+ return {
7277
+ score: clampScore(finalScore),
7278
+ verdict: scoreToVerdict(finalScore),
7279
+ hits: allHits,
7280
+ misses: allMisses,
7281
+ expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
7282
+ reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
7283
+ evaluatorRawRequest: {
7284
+ aggregator: "weighted_average",
7285
+ ...weights ? { weights } : {}
7286
+ },
7287
+ evaluatorResults
6813
7288
  };
6814
- const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
7289
+ }
7290
+ async runCodeAggregator(results, scriptPath, cwd, weights) {
7291
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7292
+ const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
7293
+ const evaluatorResults = results.map((member) => ({
7294
+ name: member.id,
7295
+ type: member.type,
7296
+ score: member.result.score,
7297
+ weight: weights?.[member.id] ?? 1,
7298
+ verdict: member.result.verdict,
7299
+ hits: [...member.result.hits],
7300
+ misses: [...member.result.misses],
7301
+ reasoning: member.result.reasoning,
7302
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7303
+ evaluatorResults: member.result.evaluatorResults,
7304
+ details: member.result.details
7305
+ }));
6815
7306
  try {
6816
- const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
7307
+ const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6817
7308
  const parsed = parseJsonSafe(stdout);
6818
7309
  const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6819
7310
  const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6820
7311
  const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6821
7312
  const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
7313
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
6822
7314
  return {
6823
7315
  score,
6824
- verdict: scoreToVerdict(score),
7316
+ verdict,
6825
7317
  hits,
6826
7318
  misses,
6827
7319
  expectedAspectCount: hits.length + misses.length || 1,
6828
7320
  reasoning,
6829
7321
  evaluatorRawRequest: {
6830
- script: this.script,
6831
- ...this.cwd ? { cwd: this.cwd } : {}
6832
- }
7322
+ aggregator: "code_judge",
7323
+ script: scriptPath
7324
+ },
7325
+ evaluatorResults
6833
7326
  };
6834
7327
  } catch (error) {
6835
7328
  const message = error instanceof Error ? error.message : String(error);
@@ -6837,312 +7330,152 @@ var CodeEvaluator = class {
6837
7330
  score: 0,
6838
7331
  verdict: "fail",
6839
7332
  hits: [],
6840
- misses: [`Code evaluator failed: ${message}`],
7333
+ misses: [`Code aggregator failed: ${message}`],
6841
7334
  expectedAspectCount: 1,
6842
7335
  reasoning: message,
6843
7336
  evaluatorRawRequest: {
6844
- script: this.script,
6845
- ...this.cwd ? { cwd: this.cwd } : {},
7337
+ aggregator: "code_judge",
7338
+ script: scriptPath,
6846
7339
  error: message
6847
- }
7340
+ },
7341
+ evaluatorResults
6848
7342
  };
6849
7343
  }
6850
7344
  }
6851
- };
6852
- function calculateRubricScore(result, rubrics) {
6853
- const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
6854
- const hits = [];
6855
- const misses = [];
6856
- let totalWeight = 0;
6857
- let earnedWeight = 0;
6858
- let failedRequired = false;
6859
- for (const check of result.checks) {
6860
- const rubric = rubricMap.get(check.id);
6861
- if (!rubric) {
6862
- continue;
7345
+ async runLlmAggregator(results, context, config) {
7346
+ const judgeProvider = context.judgeProvider;
7347
+ if (!judgeProvider) {
7348
+ throw new Error("No judge provider available for LLM aggregation");
6863
7349
  }
6864
- totalWeight += rubric.weight;
6865
- if (check.satisfied) {
6866
- earnedWeight += rubric.weight;
6867
- hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6868
- } else {
6869
- misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
6870
- if (rubric.required) {
6871
- failedRequired = true;
7350
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7351
+ const resultsJson = JSON.stringify(resultsObject, null, 2);
7352
+ const evaluatorResults = results.map((member) => ({
7353
+ name: member.id,
7354
+ type: member.type,
7355
+ score: member.result.score,
7356
+ verdict: member.result.verdict,
7357
+ hits: [...member.result.hits],
7358
+ misses: [...member.result.misses],
7359
+ reasoning: member.result.reasoning,
7360
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7361
+ evaluatorResults: member.result.evaluatorResults,
7362
+ details: member.result.details
7363
+ }));
7364
+ const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
7365
+ const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
7366
+ const systemPrompt = buildOutputSchema();
7367
+ const evaluatorRawRequest = {
7368
+ aggregator: "llm_judge",
7369
+ userPrompt,
7370
+ systemPrompt,
7371
+ target: judgeProvider.targetName
7372
+ };
7373
+ try {
7374
+ const model = judgeProvider.asLanguageModel?.();
7375
+ if (model) {
7376
+ const { text } = await (0, import_ai3.generateText)({
7377
+ model,
7378
+ system: systemPrompt,
7379
+ prompt: userPrompt
7380
+ });
7381
+ const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
7382
+ const score2 = clampScore(data2.score);
7383
+ const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
7384
+ const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
7385
+ const reasoning2 = data2.reasoning;
7386
+ return {
7387
+ score: score2,
7388
+ verdict: scoreToVerdict(score2),
7389
+ hits: hits2,
7390
+ misses: misses2,
7391
+ expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
7392
+ reasoning: reasoning2,
7393
+ evaluatorRawRequest,
7394
+ evaluatorResults
7395
+ };
6872
7396
  }
7397
+ const response = await judgeProvider.invoke({
7398
+ question: userPrompt,
7399
+ systemPrompt,
7400
+ evalCaseId: context.evalCase.id,
7401
+ attempt: context.attempt
7402
+ });
7403
+ const data = freeformEvaluationSchema.parse(
7404
+ parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
7405
+ );
7406
+ const score = clampScore(data.score);
7407
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
7408
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
7409
+ const reasoning = data.reasoning;
7410
+ return {
7411
+ score,
7412
+ verdict: scoreToVerdict(score),
7413
+ hits,
7414
+ misses,
7415
+ expectedAspectCount: Math.max(hits.length + misses.length, 1),
7416
+ reasoning,
7417
+ evaluatorRawRequest,
7418
+ evaluatorResults
7419
+ };
7420
+ } catch {
7421
+ return {
7422
+ score: 0,
7423
+ verdict: "fail",
7424
+ hits: [],
7425
+ misses: [],
7426
+ expectedAspectCount: 1,
7427
+ evaluatorRawRequest,
7428
+ evaluatorResults
7429
+ };
6873
7430
  }
6874
7431
  }
6875
- const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
6876
- const verdict = failedRequired ? "fail" : scoreToVerdict(score);
6877
- return { score, verdict, hits, misses };
6878
- }
6879
- async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
6880
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
6881
- if (exitCode !== 0) {
6882
- const trimmedErr = formatStderr(stderr);
6883
- throw new Error(
6884
- trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
6885
- );
6886
- }
6887
- return stdout.trim();
6888
- }
6889
- function formatStderr(stderr) {
6890
- const trimmed = stderr.trim();
6891
- const maxLength = 2e3;
6892
- if (trimmed.length <= maxLength) {
6893
- return trimmed;
6894
- }
6895
- const tail = trimmed.slice(-maxLength);
6896
- return `...(truncated, last ${maxLength} chars)
6897
- ${tail}`;
6898
- }
6899
- function parseJsonSafe(payload) {
6900
- try {
6901
- return JSON.parse(payload);
6902
- } catch {
6903
- return void 0;
6904
- }
6905
- }
6906
- function substituteVariables(template, variables) {
6907
- return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
6908
- return variables[varName] ?? match;
6909
- });
6910
- }
6911
- function deepEqual(a, b) {
6912
- if (a === b) return true;
6913
- if (a === null || b === null) return a === b;
6914
- if (typeof a !== typeof b) return false;
6915
- if (typeof a !== "object") return a === b;
6916
- if (Array.isArray(a) !== Array.isArray(b)) return false;
6917
- if (Array.isArray(a) && Array.isArray(b)) {
6918
- if (a.length !== b.length) return false;
6919
- return a.every((val, i) => deepEqual(val, b[i]));
6920
- }
6921
- const aObj = a;
6922
- const bObj = b;
6923
- const aKeys = Object.keys(aObj);
6924
- const bKeys = Object.keys(bObj);
6925
- if (aKeys.length !== bKeys.length) return false;
6926
- return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
6927
- }
6928
- function argsMatch(expected, actual) {
6929
- if (expected === void 0) return true;
6930
- if (expected === "any") return true;
6931
- if (actual === void 0) return false;
6932
- for (const key of Object.keys(expected)) {
6933
- if (!Object.hasOwn(actual, key)) return false;
6934
- if (!deepEqual(expected[key], actual[key])) return false;
6935
- }
6936
- return true;
6937
- }
6938
- var ToolTrajectoryEvaluator = class {
6939
- kind = "tool_trajectory";
6940
- config;
6941
- constructor(options) {
6942
- this.config = options.config;
7432
+ };
7433
+
7434
+ // src/evaluation/evaluators/cost.ts
7435
+ var CostEvaluator = class {
7436
+ kind = "cost";
7437
+ config;
7438
+ constructor(options) {
7439
+ this.config = options.config;
6943
7440
  }
6944
7441
  evaluate(context) {
6945
- const { outputMessages, traceSummary } = context;
6946
- const toolCalls = this.extractToolCallsFromMessages(outputMessages);
6947
- if (toolCalls.length === 0 && !traceSummary) {
6948
- return {
6949
- score: 0,
6950
- verdict: "fail",
6951
- hits: [],
6952
- misses: ["No trace available for evaluation"],
6953
- expectedAspectCount: 1
6954
- };
6955
- }
6956
- const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
6957
- if (!summary) {
7442
+ const { budget } = this.config;
7443
+ const costUsd = context.traceSummary?.costUsd;
7444
+ if (costUsd === void 0) {
6958
7445
  return {
6959
7446
  score: 0,
6960
7447
  verdict: "fail",
6961
7448
  hits: [],
6962
- misses: ["No trace available for evaluation"],
6963
- expectedAspectCount: 1
6964
- };
6965
- }
6966
- switch (this.config.mode) {
6967
- case "any_order":
6968
- return this.evaluateAnyOrder(summary);
6969
- case "in_order":
6970
- return this.evaluateInOrder(toolCalls);
6971
- case "exact":
6972
- return this.evaluateExact(toolCalls);
6973
- default:
6974
- return {
6975
- score: 0,
6976
- verdict: "fail",
6977
- hits: [],
6978
- misses: [`Unknown mode: ${this.config.mode}`],
6979
- expectedAspectCount: 1
6980
- };
6981
- }
6982
- }
6983
- /**
6984
- * Extract tool calls from output messages.
6985
- */
6986
- extractToolCallsFromMessages(messages) {
6987
- if (!messages) {
6988
- return [];
6989
- }
6990
- const toolCalls = [];
6991
- for (const message of messages) {
6992
- if (message.toolCalls) {
6993
- for (const call of message.toolCalls) {
6994
- toolCalls.push({
6995
- name: call.tool,
6996
- args: call.input
6997
- });
7449
+ misses: ["No cost data available in trace"],
7450
+ expectedAspectCount: 1,
7451
+ reasoning: "Execution cost not reported by provider",
7452
+ evaluatorRawRequest: {
7453
+ type: "cost",
7454
+ budget,
7455
+ costUsd: null
6998
7456
  }
6999
- }
7000
- }
7001
- return toolCalls;
7002
- }
7003
- /**
7004
- * Build a summary from extracted tool calls.
7005
- */
7006
- buildSummary(toolCalls) {
7007
- const toolCallsByName = {};
7008
- for (const call of toolCalls) {
7009
- toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
7010
- }
7011
- const toolNames = Object.keys(toolCallsByName).sort();
7012
- return {
7013
- eventCount: toolCalls.length,
7014
- toolNames,
7015
- toolCallsByName,
7016
- errorCount: 0
7017
- };
7018
- }
7019
- evaluateAnyOrder(summary) {
7020
- const minimums = this.config.minimums ?? {};
7021
- const toolNames = Object.keys(minimums);
7022
- if (toolNames.length === 0) {
7023
- return {
7024
- score: 1,
7025
- verdict: "pass",
7026
- hits: ["No tool requirements specified"],
7027
- misses: [],
7028
- expectedAspectCount: 0
7029
- };
7030
- }
7031
- const hits = [];
7032
- const misses = [];
7033
- for (const toolName of toolNames) {
7034
- const required = minimums[toolName];
7035
- const actual = summary.toolCallsByName[toolName] ?? 0;
7036
- if (actual >= required) {
7037
- hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7038
- } else {
7039
- misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7040
- }
7041
- }
7042
- const score = hits.length / toolNames.length;
7043
- return {
7044
- score,
7045
- verdict: scoreToVerdict(score),
7046
- hits,
7047
- misses,
7048
- expectedAspectCount: toolNames.length
7049
- };
7050
- }
7051
- evaluateInOrder(toolCalls) {
7052
- const expected = this.config.expected ?? [];
7053
- if (expected.length === 0) {
7054
- return {
7055
- score: 1,
7056
- verdict: "pass",
7057
- hits: ["No tool sequence specified"],
7058
- misses: [],
7059
- expectedAspectCount: 0
7060
7457
  };
7061
7458
  }
7062
- const hits = [];
7063
- const misses = [];
7064
- let actualIndex = 0;
7065
- for (let i = 0; i < expected.length; i++) {
7066
- const expectedItem = expected[i];
7067
- const expectedTool = expectedItem.tool;
7068
- let found = false;
7069
- let argsMismatch = false;
7070
- while (actualIndex < toolCalls.length) {
7071
- const actualCall = toolCalls[actualIndex];
7072
- if (actualCall.name === expectedTool) {
7073
- if (argsMatch(expectedItem.args, actualCall.args)) {
7074
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
7075
- actualIndex++;
7076
- found = true;
7077
- break;
7078
- }
7079
- misses.push(
7080
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
7081
- );
7082
- actualIndex++;
7083
- argsMismatch = true;
7084
- break;
7085
- }
7086
- actualIndex++;
7087
- }
7088
- if (!found && !argsMismatch) {
7089
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
7090
- }
7091
- }
7092
- const score = hits.length / expected.length;
7459
+ const passed = costUsd <= budget;
7460
+ const score = passed ? 1 : 0;
7461
+ const formatCost = (n) => `$${n.toFixed(4)}`;
7093
7462
  return {
7094
7463
  score,
7095
- verdict: scoreToVerdict(score),
7096
- hits,
7097
- misses,
7098
- expectedAspectCount: expected.length
7099
- };
7100
- }
7101
- evaluateExact(toolCalls) {
7102
- const expected = this.config.expected ?? [];
7103
- if (expected.length === 0) {
7104
- return {
7105
- score: 1,
7106
- verdict: "pass",
7107
- hits: ["No tool sequence specified"],
7108
- misses: [],
7109
- expectedAspectCount: 0
7110
- };
7111
- }
7112
- const hits = [];
7113
- const misses = [];
7114
- if (toolCalls.length !== expected.length) {
7115
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
7116
- }
7117
- const checkLength = Math.min(expected.length, toolCalls.length);
7118
- for (let i = 0; i < checkLength; i++) {
7119
- const expectedItem = expected[i];
7120
- const expectedTool = expectedItem.tool;
7121
- const actualCall = toolCalls[i];
7122
- const actualTool = actualCall.name;
7123
- if (actualTool === expectedTool) {
7124
- if (argsMatch(expectedItem.args, actualCall.args)) {
7125
- hits.push(`Position ${i}: ${expectedTool}`);
7126
- } else {
7127
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
7128
- }
7129
- } else {
7130
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7464
+ verdict: passed ? "pass" : "fail",
7465
+ hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
7466
+ misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
7467
+ expectedAspectCount: 1,
7468
+ reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
7469
+ evaluatorRawRequest: {
7470
+ type: "cost",
7471
+ budget,
7472
+ costUsd
7131
7473
  }
7132
- }
7133
- for (let i = checkLength; i < expected.length; i++) {
7134
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
7135
- }
7136
- const score = hits.length / expected.length;
7137
- return {
7138
- score,
7139
- verdict: scoreToVerdict(score),
7140
- hits,
7141
- misses,
7142
- expectedAspectCount: expected.length
7143
7474
  };
7144
7475
  }
7145
7476
  };
7477
+
7478
+ // src/evaluation/evaluators/field-accuracy.ts
7146
7479
  var DEFAULT_DATE_FORMATS = [
7147
7480
  "YYYY-MM-DDTHH:mm:ssZ",
7148
7481
  // ISO with timezone
@@ -7353,436 +7686,211 @@ var FieldAccuracyEvaluator = class {
7353
7686
  message: `${path17} (non-numeric value)`
7354
7687
  };
7355
7688
  }
7356
- if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
7357
- return {
7358
- path: path17,
7359
- score: 0,
7360
- weight,
7361
- hit: false,
7362
- message: `${path17} (invalid numeric value)`
7363
- };
7364
- }
7365
- const diff = Math.abs(candidateNum - expectedNum);
7366
- let withinTolerance;
7367
- if (relative) {
7368
- const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
7369
- withinTolerance = relativeDiff <= tolerance;
7370
- } else {
7371
- withinTolerance = diff <= tolerance;
7372
- }
7373
- if (withinTolerance) {
7374
- return {
7375
- path: path17,
7376
- score: 1,
7377
- weight,
7378
- hit: true,
7379
- message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
7380
- };
7381
- }
7382
- return {
7383
- path: path17,
7384
- score: 0,
7385
- weight,
7386
- hit: false,
7387
- message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7388
- };
7389
- }
7390
- /**
7391
- * Date comparison with format normalization.
7392
- */
7393
- compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
7394
- const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
7395
- const candidateDate = parseDate(String(candidateValue), formats);
7396
- const expectedDate = parseDate(String(expectedValue), formats);
7397
- if (candidateDate === null) {
7398
- return {
7399
- path: path17,
7400
- score: 0,
7401
- weight,
7402
- hit: false,
7403
- message: `${path17} (unparseable candidate date)`
7404
- };
7405
- }
7406
- if (expectedDate === null) {
7407
- return {
7408
- path: path17,
7409
- score: 0,
7410
- weight,
7411
- hit: false,
7412
- message: `${path17} (unparseable expected date)`
7413
- };
7414
- }
7415
- if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
7416
- return {
7417
- path: path17,
7418
- score: 1,
7419
- weight,
7420
- hit: true,
7421
- message: path17
7422
- };
7423
- }
7424
- return {
7425
- path: path17,
7426
- score: 0,
7427
- weight,
7428
- hit: false,
7429
- message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7430
- };
7431
- }
7432
- /**
7433
- * Aggregate field results using configured strategy.
7434
- */
7435
- aggregateResults(results) {
7436
- const aggregation = this.config.aggregation ?? "weighted_average";
7437
- const hits = [];
7438
- const misses = [];
7439
- for (const result of results) {
7440
- if (result.hit) {
7441
- hits.push(result.message);
7442
- } else {
7443
- misses.push(result.message);
7444
- }
7445
- }
7446
- let score;
7447
- if (aggregation === "all_or_nothing") {
7448
- score = misses.length === 0 ? 1 : 0;
7449
- } else {
7450
- const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
7451
- if (totalWeight === 0) {
7452
- score = results.length === 0 ? 1 : 0;
7453
- } else {
7454
- const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
7455
- score = weightedSum / totalWeight;
7456
- }
7457
- }
7458
- const reasoning = `${hits.length}/${results.length} fields matched`;
7459
- return {
7460
- score: clampScore(score),
7461
- verdict: scoreToVerdict(score),
7462
- hits: hits.slice(0, 4),
7463
- misses: misses.slice(0, 4),
7464
- expectedAspectCount: results.length,
7465
- reasoning
7466
- };
7467
- }
7468
- };
7469
- function resolvePath(obj, path17) {
7470
- if (!path17 || !obj) {
7471
- return void 0;
7472
- }
7473
- const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
7474
- let current = obj;
7475
- for (const part of parts) {
7476
- if (current === null || current === void 0) {
7477
- return void 0;
7478
- }
7479
- if (typeof current !== "object") {
7480
- return void 0;
7481
- }
7482
- const isIndex = /^\d+$/.test(part);
7483
- if (isIndex && Array.isArray(current)) {
7484
- current = current[Number.parseInt(part, 10)];
7485
- } else {
7486
- current = current[part];
7487
- }
7488
- }
7489
- return current;
7490
- }
7491
- function toNumber(value) {
7492
- if (typeof value === "number") {
7493
- return value;
7494
- }
7495
- if (typeof value === "string") {
7496
- const num = Number.parseFloat(value);
7497
- return Number.isNaN(num) ? null : num;
7498
- }
7499
- return null;
7500
- }
7501
- function parseDate(dateStr, formats) {
7502
- if (!dateStr) return null;
7503
- const trimmed = dateStr.trim();
7504
- const isoDate = new Date(trimmed);
7505
- if (!Number.isNaN(isoDate.getTime())) {
7506
- return isoDate;
7507
- }
7508
- const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
7509
- if (localizedMatch) {
7510
- const day = Number.parseInt(localizedMatch[1], 10);
7511
- const monthName = localizedMatch[2].toLowerCase();
7512
- const year = Number.parseInt(localizedMatch[3], 10);
7513
- const month = MONTH_NAMES[monthName];
7514
- if (month !== void 0) {
7515
- return new Date(year, month, day);
7516
- }
7517
- }
7518
- const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
7519
- if (usMatch) {
7520
- const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
7521
- const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
7522
- if (hasUSFormat && !hasEUFormat) {
7523
- const month = Number.parseInt(usMatch[1], 10) - 1;
7524
- const day = Number.parseInt(usMatch[2], 10);
7525
- const year = Number.parseInt(usMatch[3], 10);
7526
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7527
- return new Date(year, month, day);
7528
- }
7529
- } else if (hasEUFormat && !hasUSFormat) {
7530
- const day = Number.parseInt(usMatch[1], 10);
7531
- const month = Number.parseInt(usMatch[2], 10) - 1;
7532
- const year = Number.parseInt(usMatch[3], 10);
7533
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7534
- return new Date(year, month, day);
7535
- }
7536
- } else {
7537
- const num1 = Number.parseInt(usMatch[1], 10);
7538
- const num2 = Number.parseInt(usMatch[2], 10);
7539
- const year = Number.parseInt(usMatch[3], 10);
7540
- if (num1 > 12 && num2 <= 12) {
7541
- return new Date(year, num2 - 1, num1);
7542
- }
7543
- if (num2 > 12 && num1 <= 12) {
7544
- return new Date(year, num1 - 1, num2);
7545
- }
7546
- if (num1 <= 12 && num2 <= 31) {
7547
- return new Date(year, num1 - 1, num2);
7548
- }
7549
- }
7550
- }
7551
- return null;
7552
- }
7553
- function formatDateISO(date) {
7554
- return date.toISOString().split("T")[0];
7555
- }
7556
- function parseJsonFromTextSafe(text) {
7557
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
7558
- const match = cleaned.match(/\{[\s\S]*\}/);
7559
- const blob = match?.[0] ?? cleaned;
7560
- return JSON.parse(blob);
7561
- }
7562
- var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
7563
- {{EVALUATOR_RESULTS_JSON}}
7564
-
7565
- Decide the final score and verdict based on all evaluator results.
7566
- Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
7567
- var CompositeEvaluator = class {
7568
- kind = "composite";
7569
- config;
7570
- evaluatorFactory;
7571
- cwd;
7572
- constructor(options) {
7573
- this.config = options.config;
7574
- this.evaluatorFactory = options.evaluatorFactory;
7575
- this.cwd = options.cwd;
7576
- }
7577
- async evaluate(context) {
7578
- const memberResults = await Promise.all(
7579
- this.config.evaluators.map(async (memberConfig) => {
7580
- const evaluator = this.evaluatorFactory.create(memberConfig, context);
7581
- return {
7582
- id: memberConfig.name,
7583
- type: memberConfig.type,
7584
- result: await evaluator.evaluate(context)
7585
- };
7586
- })
7587
- );
7588
- return this.aggregate(memberResults, context);
7589
- }
7590
- async aggregate(results, context) {
7591
- const aggregator = this.config.aggregator;
7592
- switch (aggregator.type) {
7593
- case "code_judge":
7594
- return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
7595
- case "llm_judge":
7596
- return this.runLlmAggregator(results, context, aggregator);
7597
- default:
7598
- return this.runWeightedAverage(results, aggregator.weights);
7599
- }
7600
- }
7601
- runWeightedAverage(results, weights) {
7602
- let totalWeight = 0;
7603
- let weightedSum = 0;
7604
- const allHits = [];
7605
- const allMisses = [];
7606
- const reasoningParts = [];
7607
- const evaluatorResults = [];
7608
- for (const member of results) {
7609
- const weight = weights?.[member.id] ?? 1;
7610
- totalWeight += weight;
7611
- weightedSum += member.result.score * weight;
7612
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
7613
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
7614
- if (member.result.reasoning) {
7615
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
7616
- }
7617
- evaluatorResults.push({
7618
- name: member.id,
7619
- type: member.type,
7620
- score: member.result.score,
7621
- weight,
7622
- verdict: member.result.verdict,
7623
- hits: [...member.result.hits],
7624
- misses: [...member.result.misses],
7625
- reasoning: member.result.reasoning,
7626
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7627
- evaluatorResults: member.result.evaluatorResults
7628
- });
7629
- }
7630
- const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
7631
- return {
7632
- score: clampScore(finalScore),
7633
- verdict: scoreToVerdict(finalScore),
7634
- hits: allHits,
7635
- misses: allMisses,
7636
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
7637
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
7638
- evaluatorRawRequest: {
7639
- aggregator: "weighted_average",
7640
- ...weights ? { weights } : {}
7641
- },
7642
- evaluatorResults
7643
- };
7644
- }
7645
- async runCodeAggregator(results, scriptPath, cwd, weights) {
7646
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7647
- const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
7648
- const evaluatorResults = results.map((member) => ({
7649
- name: member.id,
7650
- type: member.type,
7651
- score: member.result.score,
7652
- weight: weights?.[member.id] ?? 1,
7653
- verdict: member.result.verdict,
7654
- hits: [...member.result.hits],
7655
- misses: [...member.result.misses],
7656
- reasoning: member.result.reasoning,
7657
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7658
- evaluatorResults: member.result.evaluatorResults
7659
- }));
7660
- try {
7661
- const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
7662
- const parsed = parseJsonSafe(stdout);
7663
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
7664
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
7665
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
7666
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
7667
- const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
7668
- return {
7669
- score,
7670
- verdict,
7671
- hits,
7672
- misses,
7673
- expectedAspectCount: hits.length + misses.length || 1,
7674
- reasoning,
7675
- evaluatorRawRequest: {
7676
- aggregator: "code_judge",
7677
- script: scriptPath
7678
- },
7679
- evaluatorResults
7680
- };
7681
- } catch (error) {
7682
- const message = error instanceof Error ? error.message : String(error);
7683
- return {
7684
- score: 0,
7685
- verdict: "fail",
7686
- hits: [],
7687
- misses: [`Code aggregator failed: ${message}`],
7688
- expectedAspectCount: 1,
7689
- reasoning: message,
7690
- evaluatorRawRequest: {
7691
- aggregator: "code_judge",
7692
- script: scriptPath,
7693
- error: message
7694
- },
7695
- evaluatorResults
7696
- };
7697
- }
7698
- }
7699
- async runLlmAggregator(results, context, config) {
7700
- const judgeProvider = context.judgeProvider;
7701
- if (!judgeProvider) {
7702
- throw new Error("No judge provider available for LLM aggregation");
7703
- }
7704
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7705
- const resultsJson = JSON.stringify(resultsObject, null, 2);
7706
- const evaluatorResults = results.map((member) => ({
7707
- name: member.id,
7708
- type: member.type,
7709
- score: member.result.score,
7710
- verdict: member.result.verdict,
7711
- hits: [...member.result.hits],
7712
- misses: [...member.result.misses],
7713
- reasoning: member.result.reasoning,
7714
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7715
- evaluatorResults: member.result.evaluatorResults
7716
- }));
7717
- const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
7718
- const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
7719
- const systemPrompt = buildOutputSchema();
7720
- const evaluatorRawRequest = {
7721
- aggregator: "llm_judge",
7722
- userPrompt,
7723
- systemPrompt,
7724
- target: judgeProvider.targetName
7725
- };
7726
- try {
7727
- const model = judgeProvider.asLanguageModel?.();
7728
- if (model) {
7729
- const { text } = await (0, import_ai2.generateText)({
7730
- model,
7731
- system: systemPrompt,
7732
- prompt: userPrompt
7733
- });
7734
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
7735
- const score2 = clampScore(data2.score);
7736
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
7737
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
7738
- const reasoning2 = data2.reasoning;
7739
- return {
7740
- score: score2,
7741
- verdict: scoreToVerdict(score2),
7742
- hits: hits2,
7743
- misses: misses2,
7744
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
7745
- reasoning: reasoning2,
7746
- evaluatorRawRequest,
7747
- evaluatorResults
7748
- };
7749
- }
7750
- const response = await judgeProvider.invoke({
7751
- question: userPrompt,
7752
- systemPrompt,
7753
- evalCaseId: context.evalCase.id,
7754
- attempt: context.attempt
7755
- });
7756
- const data = freeformEvaluationSchema.parse(
7757
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
7758
- );
7759
- const score = clampScore(data.score);
7760
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
7761
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
7762
- const reasoning = data.reasoning;
7689
+ if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
7763
7690
  return {
7764
- score,
7765
- verdict: scoreToVerdict(score),
7766
- hits,
7767
- misses,
7768
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
7769
- reasoning,
7770
- evaluatorRawRequest,
7771
- evaluatorResults
7691
+ path: path17,
7692
+ score: 0,
7693
+ weight,
7694
+ hit: false,
7695
+ message: `${path17} (invalid numeric value)`
7772
7696
  };
7773
- } catch {
7697
+ }
7698
+ const diff = Math.abs(candidateNum - expectedNum);
7699
+ let withinTolerance;
7700
+ if (relative) {
7701
+ const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
7702
+ withinTolerance = relativeDiff <= tolerance;
7703
+ } else {
7704
+ withinTolerance = diff <= tolerance;
7705
+ }
7706
+ if (withinTolerance) {
7707
+ return {
7708
+ path: path17,
7709
+ score: 1,
7710
+ weight,
7711
+ hit: true,
7712
+ message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
7713
+ };
7714
+ }
7715
+ return {
7716
+ path: path17,
7717
+ score: 0,
7718
+ weight,
7719
+ hit: false,
7720
+ message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7721
+ };
7722
+ }
7723
+ /**
7724
+ * Date comparison with format normalization.
7725
+ */
7726
+ compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
7727
+ const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
7728
+ const candidateDate = parseDate(String(candidateValue), formats);
7729
+ const expectedDate = parseDate(String(expectedValue), formats);
7730
+ if (candidateDate === null) {
7774
7731
  return {
7732
+ path: path17,
7775
7733
  score: 0,
7776
- verdict: "fail",
7777
- hits: [],
7778
- misses: [],
7779
- expectedAspectCount: 1,
7780
- evaluatorRawRequest,
7781
- evaluatorResults
7734
+ weight,
7735
+ hit: false,
7736
+ message: `${path17} (unparseable candidate date)`
7737
+ };
7738
+ }
7739
+ if (expectedDate === null) {
7740
+ return {
7741
+ path: path17,
7742
+ score: 0,
7743
+ weight,
7744
+ hit: false,
7745
+ message: `${path17} (unparseable expected date)`
7746
+ };
7747
+ }
7748
+ if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
7749
+ return {
7750
+ path: path17,
7751
+ score: 1,
7752
+ weight,
7753
+ hit: true,
7754
+ message: path17
7782
7755
  };
7783
7756
  }
7757
+ return {
7758
+ path: path17,
7759
+ score: 0,
7760
+ weight,
7761
+ hit: false,
7762
+ message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7763
+ };
7764
+ }
7765
+ /**
7766
+ * Aggregate field results using configured strategy.
7767
+ */
7768
+ aggregateResults(results) {
7769
+ const aggregation = this.config.aggregation ?? "weighted_average";
7770
+ const hits = [];
7771
+ const misses = [];
7772
+ for (const result of results) {
7773
+ if (result.hit) {
7774
+ hits.push(result.message);
7775
+ } else {
7776
+ misses.push(result.message);
7777
+ }
7778
+ }
7779
+ let score;
7780
+ if (aggregation === "all_or_nothing") {
7781
+ score = misses.length === 0 ? 1 : 0;
7782
+ } else {
7783
+ const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
7784
+ if (totalWeight === 0) {
7785
+ score = results.length === 0 ? 1 : 0;
7786
+ } else {
7787
+ const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
7788
+ score = weightedSum / totalWeight;
7789
+ }
7790
+ }
7791
+ const reasoning = `${hits.length}/${results.length} fields matched`;
7792
+ return {
7793
+ score: clampScore(score),
7794
+ verdict: scoreToVerdict(score),
7795
+ hits: hits.slice(0, 4),
7796
+ misses: misses.slice(0, 4),
7797
+ expectedAspectCount: results.length,
7798
+ reasoning
7799
+ };
7784
7800
  }
7785
7801
  };
7802
+ function resolvePath(obj, path17) {
7803
+ if (!path17 || !obj) {
7804
+ return void 0;
7805
+ }
7806
+ const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
7807
+ let current = obj;
7808
+ for (const part of parts) {
7809
+ if (current === null || current === void 0) {
7810
+ return void 0;
7811
+ }
7812
+ if (typeof current !== "object") {
7813
+ return void 0;
7814
+ }
7815
+ const isIndex = /^\d+$/.test(part);
7816
+ if (isIndex && Array.isArray(current)) {
7817
+ current = current[Number.parseInt(part, 10)];
7818
+ } else {
7819
+ current = current[part];
7820
+ }
7821
+ }
7822
+ return current;
7823
+ }
7824
+ function toNumber(value) {
7825
+ if (typeof value === "number") {
7826
+ return value;
7827
+ }
7828
+ if (typeof value === "string") {
7829
+ const num = Number.parseFloat(value);
7830
+ return Number.isNaN(num) ? null : num;
7831
+ }
7832
+ return null;
7833
+ }
7834
+ function parseDate(dateStr, formats) {
7835
+ if (!dateStr) return null;
7836
+ const trimmed = dateStr.trim();
7837
+ const isoDate = new Date(trimmed);
7838
+ if (!Number.isNaN(isoDate.getTime())) {
7839
+ return isoDate;
7840
+ }
7841
+ const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
7842
+ if (localizedMatch) {
7843
+ const day = Number.parseInt(localizedMatch[1], 10);
7844
+ const monthName = localizedMatch[2].toLowerCase();
7845
+ const year = Number.parseInt(localizedMatch[3], 10);
7846
+ const month = MONTH_NAMES[monthName];
7847
+ if (month !== void 0) {
7848
+ return new Date(year, month, day);
7849
+ }
7850
+ }
7851
+ const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
7852
+ if (usMatch) {
7853
+ const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
7854
+ const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
7855
+ if (hasUSFormat && !hasEUFormat) {
7856
+ const month = Number.parseInt(usMatch[1], 10) - 1;
7857
+ const day = Number.parseInt(usMatch[2], 10);
7858
+ const year = Number.parseInt(usMatch[3], 10);
7859
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7860
+ return new Date(year, month, day);
7861
+ }
7862
+ } else if (hasEUFormat && !hasUSFormat) {
7863
+ const day = Number.parseInt(usMatch[1], 10);
7864
+ const month = Number.parseInt(usMatch[2], 10) - 1;
7865
+ const year = Number.parseInt(usMatch[3], 10);
7866
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7867
+ return new Date(year, month, day);
7868
+ }
7869
+ } else {
7870
+ const num1 = Number.parseInt(usMatch[1], 10);
7871
+ const num2 = Number.parseInt(usMatch[2], 10);
7872
+ const year = Number.parseInt(usMatch[3], 10);
7873
+ if (num1 > 12 && num2 <= 12) {
7874
+ return new Date(year, num2 - 1, num1);
7875
+ }
7876
+ if (num2 > 12 && num1 <= 12) {
7877
+ return new Date(year, num1 - 1, num2);
7878
+ }
7879
+ if (num1 <= 12 && num2 <= 31) {
7880
+ return new Date(year, num1 - 1, num2);
7881
+ }
7882
+ }
7883
+ }
7884
+ return null;
7885
+ }
7886
+ function formatDateISO(date) {
7887
+ return date.toISOString().split("T")[0];
7888
+ }
7889
+ function parseJsonFromTextSafe(text) {
7890
+ return parseJsonFromText(text);
7891
+ }
7892
+
7893
+ // src/evaluation/evaluators/latency.ts
7786
7894
  var LatencyEvaluator = class {
7787
7895
  kind = "latency";
7788
7896
  config;
@@ -7816,56 +7924,16 @@ var LatencyEvaluator = class {
7816
7924
  misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
7817
7925
  expectedAspectCount: 1,
7818
7926
  reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
7819
- evaluatorRawRequest: {
7820
- type: "latency",
7821
- threshold,
7822
- durationMs
7823
- }
7824
- };
7825
- }
7826
- };
7827
- var CostEvaluator = class {
7828
- kind = "cost";
7829
- config;
7830
- constructor(options) {
7831
- this.config = options.config;
7832
- }
7833
- evaluate(context) {
7834
- const { budget } = this.config;
7835
- const costUsd = context.traceSummary?.costUsd;
7836
- if (costUsd === void 0) {
7837
- return {
7838
- score: 0,
7839
- verdict: "fail",
7840
- hits: [],
7841
- misses: ["No cost data available in trace"],
7842
- expectedAspectCount: 1,
7843
- reasoning: "Execution cost not reported by provider",
7844
- evaluatorRawRequest: {
7845
- type: "cost",
7846
- budget,
7847
- costUsd: null
7848
- }
7849
- };
7850
- }
7851
- const passed = costUsd <= budget;
7852
- const score = passed ? 1 : 0;
7853
- const formatCost = (n) => `$${n.toFixed(4)}`;
7854
- return {
7855
- score,
7856
- verdict: passed ? "pass" : "fail",
7857
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
7858
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
7859
- expectedAspectCount: 1,
7860
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
7861
- evaluatorRawRequest: {
7862
- type: "cost",
7863
- budget,
7864
- costUsd
7927
+ evaluatorRawRequest: {
7928
+ type: "latency",
7929
+ threshold,
7930
+ durationMs
7865
7931
  }
7866
7932
  };
7867
7933
  }
7868
7934
  };
7935
+
7936
+ // src/evaluation/evaluators/token-usage.ts
7869
7937
  var TokenUsageEvaluator = class {
7870
7938
  kind = "token_usage";
7871
7939
  config;
@@ -7949,8 +8017,228 @@ var TokenUsageEvaluator = class {
7949
8017
  }
7950
8018
  };
7951
8019
 
8020
+ // src/evaluation/evaluators/tool-trajectory.ts
8021
+ function argsMatch(expected, actual) {
8022
+ if (expected === void 0) return true;
8023
+ if (expected === "any") return true;
8024
+ if (actual === void 0) return false;
8025
+ for (const key of Object.keys(expected)) {
8026
+ if (!Object.hasOwn(actual, key)) return false;
8027
+ if (!deepEqual(expected[key], actual[key])) return false;
8028
+ }
8029
+ return true;
8030
+ }
8031
+ var ToolTrajectoryEvaluator = class {
8032
+ kind = "tool_trajectory";
8033
+ config;
8034
+ constructor(options) {
8035
+ this.config = options.config;
8036
+ }
8037
+ evaluate(context) {
8038
+ const { outputMessages, traceSummary } = context;
8039
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
8040
+ if (toolCalls.length === 0 && !traceSummary) {
8041
+ return {
8042
+ score: 0,
8043
+ verdict: "fail",
8044
+ hits: [],
8045
+ misses: ["No trace available for evaluation"],
8046
+ expectedAspectCount: 1
8047
+ };
8048
+ }
8049
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
8050
+ if (!summary) {
8051
+ return {
8052
+ score: 0,
8053
+ verdict: "fail",
8054
+ hits: [],
8055
+ misses: ["No trace available for evaluation"],
8056
+ expectedAspectCount: 1
8057
+ };
8058
+ }
8059
+ switch (this.config.mode) {
8060
+ case "any_order":
8061
+ return this.evaluateAnyOrder(summary);
8062
+ case "in_order":
8063
+ return this.evaluateInOrder(toolCalls);
8064
+ case "exact":
8065
+ return this.evaluateExact(toolCalls);
8066
+ default:
8067
+ return {
8068
+ score: 0,
8069
+ verdict: "fail",
8070
+ hits: [],
8071
+ misses: [`Unknown mode: ${this.config.mode}`],
8072
+ expectedAspectCount: 1
8073
+ };
8074
+ }
8075
+ }
8076
+ /**
8077
+ * Extract tool calls from output messages.
8078
+ */
8079
+ extractToolCallsFromMessages(messages) {
8080
+ if (!messages) {
8081
+ return [];
8082
+ }
8083
+ const toolCalls = [];
8084
+ for (const message of messages) {
8085
+ if (message.toolCalls) {
8086
+ for (const call of message.toolCalls) {
8087
+ toolCalls.push({
8088
+ name: call.tool,
8089
+ args: call.input
8090
+ });
8091
+ }
8092
+ }
8093
+ }
8094
+ return toolCalls;
8095
+ }
8096
+ /**
8097
+ * Build a summary from extracted tool calls.
8098
+ */
8099
+ buildSummary(toolCalls) {
8100
+ const toolCallsByName = {};
8101
+ for (const call of toolCalls) {
8102
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
8103
+ }
8104
+ const toolNames = Object.keys(toolCallsByName).sort();
8105
+ return {
8106
+ eventCount: toolCalls.length,
8107
+ toolNames,
8108
+ toolCallsByName,
8109
+ errorCount: 0
8110
+ };
8111
+ }
8112
+ evaluateAnyOrder(summary) {
8113
+ const minimums = this.config.minimums ?? {};
8114
+ const toolNames = Object.keys(minimums);
8115
+ if (toolNames.length === 0) {
8116
+ return {
8117
+ score: 1,
8118
+ verdict: "pass",
8119
+ hits: ["No tool requirements specified"],
8120
+ misses: [],
8121
+ expectedAspectCount: 0
8122
+ };
8123
+ }
8124
+ const hits = [];
8125
+ const misses = [];
8126
+ for (const toolName of toolNames) {
8127
+ const required = minimums[toolName];
8128
+ const actual = summary.toolCallsByName[toolName] ?? 0;
8129
+ if (actual >= required) {
8130
+ hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
8131
+ } else {
8132
+ misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
8133
+ }
8134
+ }
8135
+ const score = hits.length / toolNames.length;
8136
+ return {
8137
+ score,
8138
+ verdict: scoreToVerdict(score),
8139
+ hits,
8140
+ misses,
8141
+ expectedAspectCount: toolNames.length
8142
+ };
8143
+ }
8144
+ evaluateInOrder(toolCalls) {
8145
+ const expected = this.config.expected ?? [];
8146
+ if (expected.length === 0) {
8147
+ return {
8148
+ score: 1,
8149
+ verdict: "pass",
8150
+ hits: ["No tool sequence specified"],
8151
+ misses: [],
8152
+ expectedAspectCount: 0
8153
+ };
8154
+ }
8155
+ const hits = [];
8156
+ const misses = [];
8157
+ let actualIndex = 0;
8158
+ for (let i = 0; i < expected.length; i++) {
8159
+ const expectedItem = expected[i];
8160
+ const expectedTool = expectedItem.tool;
8161
+ let found = false;
8162
+ let argsMismatch = false;
8163
+ while (actualIndex < toolCalls.length) {
8164
+ const actualCall = toolCalls[actualIndex];
8165
+ if (actualCall.name === expectedTool) {
8166
+ if (argsMatch(expectedItem.args, actualCall.args)) {
8167
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
8168
+ actualIndex++;
8169
+ found = true;
8170
+ break;
8171
+ }
8172
+ misses.push(
8173
+ `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
8174
+ );
8175
+ actualIndex++;
8176
+ argsMismatch = true;
8177
+ break;
8178
+ }
8179
+ actualIndex++;
8180
+ }
8181
+ if (!found && !argsMismatch) {
8182
+ misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
8183
+ }
8184
+ }
8185
+ const score = hits.length / expected.length;
8186
+ return {
8187
+ score,
8188
+ verdict: scoreToVerdict(score),
8189
+ hits,
8190
+ misses,
8191
+ expectedAspectCount: expected.length
8192
+ };
8193
+ }
8194
+ evaluateExact(toolCalls) {
8195
+ const expected = this.config.expected ?? [];
8196
+ if (expected.length === 0) {
8197
+ return {
8198
+ score: 1,
8199
+ verdict: "pass",
8200
+ hits: ["No tool sequence specified"],
8201
+ misses: [],
8202
+ expectedAspectCount: 0
8203
+ };
8204
+ }
8205
+ const hits = [];
8206
+ const misses = [];
8207
+ if (toolCalls.length !== expected.length) {
8208
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
8209
+ }
8210
+ const checkLength = Math.min(expected.length, toolCalls.length);
8211
+ for (let i = 0; i < checkLength; i++) {
8212
+ const expectedItem = expected[i];
8213
+ const expectedTool = expectedItem.tool;
8214
+ const actualCall = toolCalls[i];
8215
+ const actualTool = actualCall.name;
8216
+ if (actualTool === expectedTool) {
8217
+ if (argsMatch(expectedItem.args, actualCall.args)) {
8218
+ hits.push(`Position ${i}: ${expectedTool}`);
8219
+ } else {
8220
+ misses.push(`Position ${i}: ${expectedTool} args mismatch`);
8221
+ }
8222
+ } else {
8223
+ misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
8224
+ }
8225
+ }
8226
+ for (let i = checkLength; i < expected.length; i++) {
8227
+ misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
8228
+ }
8229
+ const score = hits.length / expected.length;
8230
+ return {
8231
+ score,
8232
+ verdict: scoreToVerdict(score),
8233
+ hits,
8234
+ misses,
8235
+ expectedAspectCount: expected.length
8236
+ };
8237
+ }
8238
+ };
8239
+
7952
8240
  // src/evaluation/orchestrator.ts
7953
- var import_node_crypto4 = require("crypto");
8241
+ var import_node_crypto5 = require("crypto");
7954
8242
  var import_node_path16 = __toESM(require("path"), 1);
7955
8243
 
7956
8244
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -8162,6 +8450,17 @@ async function runEvaluation(options) {
8162
8450
  }
8163
8451
  return getOrCreateProvider(resolvedJudge);
8164
8452
  };
8453
+ const targetResolver = (name) => {
8454
+ const resolved = resolveTargetByName(name);
8455
+ if (!resolved) {
8456
+ return void 0;
8457
+ }
8458
+ return getOrCreateProvider(resolved);
8459
+ };
8460
+ const availableTargets = [
8461
+ target.name,
8462
+ ...Array.from(targetDefinitions.keys())
8463
+ ];
8165
8464
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
8166
8465
  const primaryProvider = getOrCreateProvider(target);
8167
8466
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -8191,7 +8490,9 @@ async function runEvaluation(options) {
8191
8490
  onResult,
8192
8491
  verbose,
8193
8492
  resolveJudgeProvider,
8194
- agentTimeoutMs
8493
+ agentTimeoutMs,
8494
+ targetResolver,
8495
+ availableTargets
8195
8496
  });
8196
8497
  } catch (error) {
8197
8498
  if (verbose) {
@@ -8230,7 +8531,9 @@ async function runEvaluation(options) {
8230
8531
  cache,
8231
8532
  useCache,
8232
8533
  now,
8233
- judgeProvider
8534
+ judgeProvider,
8535
+ targetResolver,
8536
+ availableTargets
8234
8537
  });
8235
8538
  if (onProgress) {
8236
8539
  await onProgress({
@@ -8297,7 +8600,9 @@ async function runBatchEvaluation(options) {
8297
8600
  onProgress,
8298
8601
  onResult,
8299
8602
  resolveJudgeProvider,
8300
- agentTimeoutMs
8603
+ agentTimeoutMs,
8604
+ targetResolver,
8605
+ availableTargets
8301
8606
  } = options;
8302
8607
  const promptInputsList = [];
8303
8608
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -8356,7 +8661,7 @@ async function runBatchEvaluation(options) {
8356
8661
  costUsd: providerResponse.costUsd,
8357
8662
  durationMs: providerResponse.durationMs
8358
8663
  }) : void 0;
8359
- const candidate = extractLastAssistantContent(outputMessages);
8664
+ const candidate = extractLastAssistantContent2(outputMessages);
8360
8665
  const providerError = extractProviderError(providerResponse);
8361
8666
  let result;
8362
8667
  try {
@@ -8372,7 +8677,9 @@ async function runBatchEvaluation(options) {
8372
8677
  judgeProvider: await resolveJudgeProvider(target),
8373
8678
  agentTimeoutMs,
8374
8679
  outputMessages,
8375
- traceSummary
8680
+ traceSummary,
8681
+ targetResolver,
8682
+ availableTargets
8376
8683
  });
8377
8684
  if (providerError) {
8378
8685
  result = { ...result, error: providerError };
@@ -8430,7 +8737,9 @@ async function runEvalCase(options) {
8430
8737
  cache,
8431
8738
  useCache,
8432
8739
  signal,
8433
- judgeProvider
8740
+ judgeProvider,
8741
+ targetResolver,
8742
+ availableTargets
8434
8743
  } = options;
8435
8744
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
8436
8745
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -8489,7 +8798,7 @@ async function runEvalCase(options) {
8489
8798
  costUsd: providerResponse.costUsd,
8490
8799
  durationMs: providerResponse.durationMs
8491
8800
  }) : void 0;
8492
- const candidate = extractLastAssistantContent(outputMessages);
8801
+ const candidate = extractLastAssistantContent2(outputMessages);
8493
8802
  const providerError = extractProviderError(providerResponse);
8494
8803
  try {
8495
8804
  const result = await evaluateCandidate({
@@ -8504,7 +8813,9 @@ async function runEvalCase(options) {
8504
8813
  judgeProvider,
8505
8814
  agentTimeoutMs,
8506
8815
  outputMessages,
8507
- traceSummary
8816
+ traceSummary,
8817
+ targetResolver,
8818
+ availableTargets
8508
8819
  });
8509
8820
  return providerError ? { ...result, error: providerError } : result;
8510
8821
  } catch (error) {
@@ -8524,7 +8835,9 @@ async function evaluateCandidate(options) {
8524
8835
  judgeProvider,
8525
8836
  agentTimeoutMs,
8526
8837
  outputMessages,
8527
- traceSummary
8838
+ traceSummary,
8839
+ targetResolver,
8840
+ availableTargets
8528
8841
  } = options;
8529
8842
  const gradeTimestamp = nowFn();
8530
8843
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -8539,7 +8852,9 @@ async function evaluateCandidate(options) {
8539
8852
  judgeProvider,
8540
8853
  agentTimeoutMs,
8541
8854
  outputMessages,
8542
- traceSummary
8855
+ traceSummary,
8856
+ targetResolver,
8857
+ availableTargets
8543
8858
  });
8544
8859
  const completedAt = nowFn();
8545
8860
  let agentProviderRequest;
@@ -8592,7 +8907,9 @@ async function runEvaluatorsForCase(options) {
8592
8907
  judgeProvider,
8593
8908
  agentTimeoutMs,
8594
8909
  outputMessages,
8595
- traceSummary
8910
+ traceSummary,
8911
+ targetResolver,
8912
+ availableTargets
8596
8913
  } = options;
8597
8914
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
8598
8915
  return runEvaluatorList({
@@ -8608,7 +8925,9 @@ async function runEvaluatorsForCase(options) {
8608
8925
  judgeProvider,
8609
8926
  agentTimeoutMs,
8610
8927
  outputMessages,
8611
- traceSummary
8928
+ traceSummary,
8929
+ targetResolver,
8930
+ availableTargets
8612
8931
  });
8613
8932
  }
8614
8933
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -8626,7 +8945,9 @@ async function runEvaluatorsForCase(options) {
8626
8945
  now,
8627
8946
  judgeProvider,
8628
8947
  outputMessages,
8629
- traceSummary
8948
+ traceSummary,
8949
+ targetResolver,
8950
+ availableTargets
8630
8951
  });
8631
8952
  return { score };
8632
8953
  }
@@ -8644,7 +8965,9 @@ async function runEvaluatorList(options) {
8644
8965
  judgeProvider,
8645
8966
  agentTimeoutMs,
8646
8967
  outputMessages,
8647
- traceSummary
8968
+ traceSummary,
8969
+ targetResolver,
8970
+ availableTargets
8648
8971
  } = options;
8649
8972
  const scored = [];
8650
8973
  const evaluatorResults = [];
@@ -8682,7 +9005,8 @@ async function runEvaluatorList(options) {
8682
9005
  script: evaluator.script,
8683
9006
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
8684
9007
  agentTimeoutMs,
8685
- config: evaluator.config
9008
+ config: evaluator.config,
9009
+ target: evaluator.target
8686
9010
  });
8687
9011
  const score2 = await codeEvaluator.evaluate({
8688
9012
  evalCase,
@@ -8692,8 +9016,11 @@ async function runEvaluatorList(options) {
8692
9016
  attempt,
8693
9017
  promptInputs,
8694
9018
  now,
9019
+ judgeProvider,
8695
9020
  outputMessages,
8696
- traceSummary
9021
+ traceSummary,
9022
+ targetResolver,
9023
+ availableTargets
8697
9024
  });
8698
9025
  const weight = evaluator.weight ?? 1;
8699
9026
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -8706,7 +9033,8 @@ async function runEvaluatorList(options) {
8706
9033
  hits: score2.hits,
8707
9034
  misses: score2.misses,
8708
9035
  reasoning: score2.reasoning,
8709
- evaluatorProviderRequest: score2.evaluatorRawRequest
9036
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
9037
+ details: score2.details
8710
9038
  });
8711
9039
  }
8712
9040
  if (evaluator.type === "composite") {
@@ -8720,7 +9048,8 @@ async function runEvaluatorList(options) {
8720
9048
  script: memberConfig.script,
8721
9049
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
8722
9050
  agentTimeoutMs,
8723
- config: memberConfig.config
9051
+ config: memberConfig.config,
9052
+ target: memberConfig.target
8724
9053
  });
8725
9054
  case "composite":
8726
9055
  return new CompositeEvaluator({
@@ -8769,7 +9098,9 @@ async function runEvaluatorList(options) {
8769
9098
  now,
8770
9099
  judgeProvider,
8771
9100
  outputMessages,
8772
- traceSummary
9101
+ traceSummary,
9102
+ targetResolver,
9103
+ availableTargets
8773
9104
  });
8774
9105
  const weight = evaluator.weight ?? 1;
8775
9106
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -8965,11 +9296,11 @@ async function runEvaluatorList(options) {
8965
9296
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
8966
9297
  0
8967
9298
  );
8968
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
9299
+ const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
8969
9300
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
8970
9301
  const score = {
8971
9302
  score: aggregateScore,
8972
- verdict: scoreToVerdict2(aggregateScore),
9303
+ verdict: scoreToVerdict(aggregateScore),
8973
9304
  hits,
8974
9305
  misses,
8975
9306
  expectedAspectCount,
@@ -9016,18 +9347,6 @@ async function resolveCustomPrompt(config) {
9016
9347
  }
9017
9348
  return config.prompt;
9018
9349
  }
9019
- function isNonEmptyString2(value) {
9020
- return typeof value === "string" && value.trim().length > 0;
9021
- }
9022
- function scoreToVerdict2(score) {
9023
- if (score >= 0.8) {
9024
- return "pass";
9025
- }
9026
- if (score >= 0.6) {
9027
- return "borderline";
9028
- }
9029
- return "fail";
9030
- }
9031
9350
  function filterEvalCases(evalCases, evalId) {
9032
9351
  if (!evalId) {
9033
9352
  return evalCases;
@@ -9129,7 +9448,7 @@ function extractProviderError(response) {
9129
9448
  return trimmed.length > 0 ? trimmed : void 0;
9130
9449
  }
9131
9450
  function createCacheKey(provider, target, evalCase, promptInputs) {
9132
- const hash = (0, import_node_crypto4.createHash)("sha256");
9451
+ const hash = (0, import_node_crypto5.createHash)("sha256");
9133
9452
  hash.update(provider.id);
9134
9453
  hash.update(target.name);
9135
9454
  hash.update(evalCase.id);
@@ -9170,7 +9489,8 @@ function mapChildResults(children) {
9170
9489
  misses: child.misses,
9171
9490
  reasoning: child.reasoning,
9172
9491
  evaluatorProviderRequest: child.evaluatorRawRequest,
9173
- evaluatorResults: mapChildResults(child.evaluatorResults)
9492
+ evaluatorResults: mapChildResults(child.evaluatorResults),
9493
+ details: child.details
9174
9494
  }));
9175
9495
  }
9176
9496
  function computeWeightedMean(entries) {
@@ -9185,7 +9505,7 @@ function computeWeightedMean(entries) {
9185
9505
  }
9186
9506
 
9187
9507
  // src/evaluation/generators/rubric-generator.ts
9188
- var import_ai3 = require("ai");
9508
+ var import_ai4 = require("ai");
9189
9509
  var import_zod4 = require("zod");
9190
9510
  var rubricItemSchema = import_zod4.z.object({
9191
9511
  id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -9219,7 +9539,7 @@ You must return a valid JSON object matching this schema:
9219
9539
  let lastError;
9220
9540
  for (let attempt = 1; attempt <= 3; attempt++) {
9221
9541
  try {
9222
- const { text } = await (0, import_ai3.generateText)({
9542
+ const { text } = await (0, import_ai4.generateText)({
9223
9543
  model,
9224
9544
  system,
9225
9545
  prompt
@@ -9282,31 +9602,39 @@ function createAgentKernel() {
9282
9602
  ToolTrajectoryEvaluator,
9283
9603
  avgToolDurationMs,
9284
9604
  buildDirectoryChain,
9605
+ buildOutputSchema,
9285
9606
  buildPromptInputs,
9286
9607
  buildSearchRoots,
9608
+ clampScore,
9287
9609
  computeTraceSummary,
9288
9610
  consumeClaudeCodeLogEntries,
9289
9611
  consumeCodexLogEntries,
9290
9612
  consumePiLogEntries,
9291
9613
  createAgentKernel,
9292
9614
  createProvider,
9615
+ deepEqual,
9293
9616
  ensureVSCodeSubagents,
9617
+ executeScript,
9294
9618
  explorationRatio,
9295
- extractCodeBlocks,
9619
+ extractJsonBlob,
9296
9620
  fileExists,
9297
9621
  findGitRoot,
9622
+ freeformEvaluationSchema,
9298
9623
  generateRubrics,
9299
9624
  getHitCount,
9300
9625
  isEvaluatorKind,
9301
9626
  isGuidelineFile,
9302
9627
  isJsonObject,
9303
9628
  isJsonValue,
9629
+ isNonEmptyString,
9304
9630
  isTestMessage,
9305
9631
  isTestMessageRole,
9306
9632
  listTargetNames,
9307
9633
  loadEvalCases,
9308
9634
  mergeExecutionMetrics,
9309
9635
  normalizeLineEndings,
9636
+ parseJsonFromText,
9637
+ parseJsonSafe,
9310
9638
  readJsonFile,
9311
9639
  readTargetDefinitions,
9312
9640
  readTestSuiteMetadata,
@@ -9316,6 +9644,7 @@ function createAgentKernel() {
9316
9644
  resolveTargetDefinition,
9317
9645
  runEvalCase,
9318
9646
  runEvaluation,
9647
+ scoreToVerdict,
9319
9648
  subscribeToClaudeCodeLogEntries,
9320
9649
  subscribeToCodexLogEntries,
9321
9650
  subscribeToPiLogEntries,