@agentv/core 2.0.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -42,33 +42,39 @@ __export(index_exports, {
42
42
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
43
43
  avgToolDurationMs: () => avgToolDurationMs,
44
44
  buildDirectoryChain: () => buildDirectoryChain2,
45
+ buildOutputSchema: () => buildOutputSchema,
45
46
  buildPromptInputs: () => buildPromptInputs,
46
47
  buildSearchRoots: () => buildSearchRoots2,
48
+ clampScore: () => clampScore,
47
49
  computeTraceSummary: () => computeTraceSummary,
48
50
  consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
49
51
  consumeCodexLogEntries: () => consumeCodexLogEntries,
50
52
  consumePiLogEntries: () => consumePiLogEntries,
51
53
  createAgentKernel: () => createAgentKernel,
52
54
  createProvider: () => createProvider,
55
+ deepEqual: () => deepEqual,
53
56
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
57
+ executeScript: () => executeScript,
54
58
  explorationRatio: () => explorationRatio,
55
- extractCodeBlocks: () => extractCodeBlocks,
59
+ extractJsonBlob: () => extractJsonBlob,
56
60
  fileExists: () => fileExists2,
57
61
  findGitRoot: () => findGitRoot,
62
+ freeformEvaluationSchema: () => freeformEvaluationSchema,
58
63
  generateRubrics: () => generateRubrics,
59
64
  getHitCount: () => getHitCount,
60
65
  isEvaluatorKind: () => isEvaluatorKind,
61
66
  isGuidelineFile: () => isGuidelineFile,
62
67
  isJsonObject: () => isJsonObject,
63
68
  isJsonValue: () => isJsonValue,
69
+ isNonEmptyString: () => isNonEmptyString,
64
70
  isTestMessage: () => isTestMessage,
65
71
  isTestMessageRole: () => isTestMessageRole,
66
72
  listTargetNames: () => listTargetNames,
67
73
  loadEvalCases: () => loadEvalCases,
68
74
  mergeExecutionMetrics: () => mergeExecutionMetrics,
69
75
  normalizeLineEndings: () => normalizeLineEndings,
70
- parseCodeJudgePayload: () => parseCodeJudgePayload,
71
- readCodeJudgePayload: () => readCodeJudgePayload,
76
+ parseJsonFromText: () => parseJsonFromText,
77
+ parseJsonSafe: () => parseJsonSafe,
72
78
  readJsonFile: () => readJsonFile,
73
79
  readTargetDefinitions: () => readTargetDefinitions,
74
80
  readTestSuiteMetadata: () => readTestSuiteMetadata,
@@ -78,6 +84,7 @@ __export(index_exports, {
78
84
  resolveTargetDefinition: () => resolveTargetDefinition,
79
85
  runEvalCase: () => runEvalCase,
80
86
  runEvaluation: () => runEvaluation,
87
+ scoreToVerdict: () => scoreToVerdict,
81
88
  subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
82
89
  subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
83
90
  subscribeToPiLogEntries: () => subscribeToPiLogEntries,
@@ -223,85 +230,6 @@ var import_promises6 = require("fs/promises");
223
230
  var import_node_path6 = __toESM(require("path"), 1);
224
231
  var import_yaml2 = require("yaml");
225
232
 
226
- // src/evaluation/formatting/segment-formatter.ts
227
- function extractCodeBlocks(segments) {
228
- const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
229
- const codeBlocks = [];
230
- for (const segment of segments) {
231
- const typeValue = segment.type;
232
- if (typeof typeValue !== "string" || typeValue !== "text") {
233
- continue;
234
- }
235
- const textValue = segment.value;
236
- if (typeof textValue !== "string") {
237
- continue;
238
- }
239
- const matches = textValue.match(CODE_BLOCK_PATTERN);
240
- if (matches) {
241
- codeBlocks.push(...matches);
242
- }
243
- }
244
- return codeBlocks;
245
- }
246
- function formatFileContents(parts) {
247
- const fileCount = parts.filter((p) => p.isFile).length;
248
- if (fileCount > 0) {
249
- return parts.map((part) => {
250
- if (part.isFile && part.displayPath) {
251
- return `<file path="${part.displayPath}">
252
- ${part.content}
253
- </file>`;
254
- }
255
- return part.content;
256
- }).join("\n\n");
257
- }
258
- return parts.map((p) => p.content).join(" ");
259
- }
260
- function formatSegment(segment, mode = "lm") {
261
- const type = asString(segment.type);
262
- if (type === "text") {
263
- return asString(segment.value);
264
- }
265
- if (type === "guideline_ref") {
266
- const refPath = asString(segment.path);
267
- return refPath ? `<Attached: ${refPath}>` : void 0;
268
- }
269
- if (type === "file") {
270
- const filePath = asString(segment.path);
271
- if (!filePath) {
272
- return void 0;
273
- }
274
- if (mode === "agent") {
275
- return `<file: path="${filePath}">`;
276
- }
277
- const text = asString(segment.text);
278
- if (text && filePath) {
279
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
280
- }
281
- }
282
- return void 0;
283
- }
284
- function hasVisibleContent(segments) {
285
- return segments.some((segment) => {
286
- const type = asString(segment.type);
287
- if (type === "text") {
288
- const value = asString(segment.value);
289
- return value !== void 0 && value.trim().length > 0;
290
- }
291
- if (type === "guideline_ref") {
292
- return false;
293
- }
294
- if (type === "file") {
295
- const text = asString(segment.text);
296
- return text !== void 0 && text.trim().length > 0;
297
- }
298
- return false;
299
- });
300
- }
301
- function asString(value) {
302
- return typeof value === "string" ? value : void 0;
303
- }
304
-
305
233
  // src/evaluation/loaders/config-loader.ts
306
234
  var import_promises2 = require("fs/promises");
307
235
  var import_node_path2 = __toESM(require("path"), 1);
@@ -556,7 +484,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
556
484
  logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
557
485
  continue;
558
486
  }
559
- const name = asString2(rawEvaluator.name);
487
+ const name = asString(rawEvaluator.name);
560
488
  const typeValue = rawEvaluator.type;
561
489
  if (!name || !isEvaluatorKind(typeValue)) {
562
490
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -584,7 +512,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
584
512
  continue;
585
513
  }
586
514
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
587
- const cwd = asString2(rawEvaluator.cwd);
515
+ const cwd = asString(rawEvaluator.cwd);
588
516
  let resolvedCwd;
589
517
  if (cwd) {
590
518
  const resolved = await resolveFileReference(cwd, searchRoots);
@@ -599,7 +527,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
599
527
  } else {
600
528
  resolvedCwd = searchRoots[0];
601
529
  }
602
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
530
+ const rawTarget = rawEvaluator.target;
531
+ let targetConfig;
532
+ if (rawTarget !== void 0) {
533
+ if (isJsonObject2(rawTarget)) {
534
+ const maxCalls = rawTarget.max_calls;
535
+ if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
536
+ logWarning2(
537
+ `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
538
+ );
539
+ } else {
540
+ targetConfig = {
541
+ ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
542
+ };
543
+ }
544
+ } else if (rawTarget === true) {
545
+ targetConfig = {};
546
+ } else {
547
+ logWarning2(
548
+ `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
549
+ );
550
+ }
551
+ }
552
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
603
553
  const config = {};
604
554
  for (const [key, value] of Object.entries(rawEvaluator)) {
605
555
  if (!knownProps.has(key) && value !== void 0) {
@@ -613,7 +563,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
613
563
  cwd,
614
564
  resolvedCwd,
615
565
  ...weight2 !== void 0 ? { weight: weight2 } : {},
616
- ...Object.keys(config).length > 0 ? { config } : {}
566
+ ...Object.keys(config).length > 0 ? { config } : {},
567
+ ...targetConfig !== void 0 ? { target: targetConfig } : {}
617
568
  });
618
569
  continue;
619
570
  }
@@ -630,7 +581,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
630
581
  logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
631
582
  continue;
632
583
  }
633
- const aggregatorType = asString2(rawAggregator.type);
584
+ const aggregatorType = asString(rawAggregator.type);
634
585
  if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
635
586
  logWarning2(
636
587
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -643,7 +594,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
643
594
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
644
595
  continue;
645
596
  }
646
- const memberName = asString2(rawMember.name);
597
+ const memberName = asString(rawMember.name);
647
598
  const memberType = rawMember.type;
648
599
  if (!memberName || !isEvaluatorKind(memberType)) {
649
600
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -681,7 +632,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
681
632
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
682
633
  };
683
634
  } else if (aggregatorType === "code_judge") {
684
- const aggregatorPath = asString2(rawAggregator.path);
635
+ const aggregatorPath = asString(rawAggregator.path);
685
636
  if (!aggregatorPath) {
686
637
  logWarning2(
687
638
  `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -694,7 +645,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
694
645
  cwd: searchRoots[0]
695
646
  };
696
647
  } else {
697
- const aggregatorPrompt = asString2(rawAggregator.prompt);
648
+ const aggregatorPrompt = asString(rawAggregator.prompt);
698
649
  let promptPath2;
699
650
  if (aggregatorPrompt) {
700
651
  const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
@@ -719,7 +670,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
719
670
  continue;
720
671
  }
721
672
  if (typeValue === "tool_trajectory") {
722
- const mode = asString2(rawEvaluator.mode);
673
+ const mode = asString(rawEvaluator.mode);
723
674
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
724
675
  logWarning2(
725
676
  `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -810,8 +761,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
810
761
  );
811
762
  continue;
812
763
  }
813
- const fieldPath = asString2(rawField.path);
814
- const match = asString2(rawField.match);
764
+ const fieldPath = asString(rawField.path);
765
+ const match = asString(rawField.match);
815
766
  if (!fieldPath) {
816
767
  logWarning2(
817
768
  `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -841,7 +792,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
841
792
  );
842
793
  continue;
843
794
  }
844
- const aggregation = asString2(rawEvaluator.aggregation);
795
+ const aggregation = asString(rawEvaluator.aggregation);
845
796
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
846
797
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
847
798
  evaluators.push({
@@ -922,7 +873,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
922
873
  });
923
874
  continue;
924
875
  }
925
- const prompt = asString2(rawEvaluator.prompt);
876
+ const prompt = asString(rawEvaluator.prompt);
926
877
  let promptPath;
927
878
  if (prompt) {
928
879
  const resolved = await resolveFileReference(prompt, searchRoots);
@@ -941,11 +892,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
941
892
  );
942
893
  }
943
894
  }
944
- const _model = asString2(rawEvaluator.model);
895
+ const _model = asString(rawEvaluator.model);
945
896
  const rawRubrics = rawEvaluator.rubrics;
946
897
  const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
947
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
948
- description: asString2(rubric.description) ?? "",
898
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
899
+ description: asString(rubric.description) ?? "",
949
900
  weight: typeof rubric.weight === "number" ? rubric.weight : 1,
950
901
  required: typeof rubric.required === "boolean" ? rubric.required : true
951
902
  })).filter((r) => r.description.length > 0) : void 0;
@@ -989,7 +940,7 @@ function coerceEvaluator(candidate, contextId) {
989
940
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
990
941
  return void 0;
991
942
  }
992
- function asString2(value) {
943
+ function asString(value) {
993
944
  return typeof value === "string" ? value : void 0;
994
945
  }
995
946
  function asStringArray(value, description) {
@@ -1065,6 +1016,68 @@ function isValidFieldAggregationType(value) {
1065
1016
  // src/evaluation/loaders/message-processor.ts
1066
1017
  var import_promises4 = require("fs/promises");
1067
1018
  var import_node_path4 = __toESM(require("path"), 1);
1019
+
1020
+ // src/evaluation/formatting/segment-formatter.ts
1021
+ function formatFileContents(parts) {
1022
+ const fileCount = parts.filter((p) => p.isFile).length;
1023
+ if (fileCount > 0) {
1024
+ return parts.map((part) => {
1025
+ if (part.isFile && part.displayPath) {
1026
+ return `<file path="${part.displayPath}">
1027
+ ${part.content}
1028
+ </file>`;
1029
+ }
1030
+ return part.content;
1031
+ }).join("\n\n");
1032
+ }
1033
+ return parts.map((p) => p.content).join(" ");
1034
+ }
1035
+ function formatSegment(segment, mode = "lm") {
1036
+ const type = asString2(segment.type);
1037
+ if (type === "text") {
1038
+ return asString2(segment.value);
1039
+ }
1040
+ if (type === "guideline_ref") {
1041
+ const refPath = asString2(segment.path);
1042
+ return refPath ? `<Attached: ${refPath}>` : void 0;
1043
+ }
1044
+ if (type === "file") {
1045
+ const filePath = asString2(segment.path);
1046
+ if (!filePath) {
1047
+ return void 0;
1048
+ }
1049
+ if (mode === "agent") {
1050
+ return `<file: path="${filePath}">`;
1051
+ }
1052
+ const text = asString2(segment.text);
1053
+ if (text && filePath) {
1054
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
1055
+ }
1056
+ }
1057
+ return void 0;
1058
+ }
1059
+ function hasVisibleContent(segments) {
1060
+ return segments.some((segment) => {
1061
+ const type = asString2(segment.type);
1062
+ if (type === "text") {
1063
+ const value = asString2(segment.value);
1064
+ return value !== void 0 && value.trim().length > 0;
1065
+ }
1066
+ if (type === "guideline_ref") {
1067
+ return false;
1068
+ }
1069
+ if (type === "file") {
1070
+ const text = asString2(segment.text);
1071
+ return text !== void 0 && text.trim().length > 0;
1072
+ }
1073
+ return false;
1074
+ });
1075
+ }
1076
+ function asString2(value) {
1077
+ return typeof value === "string" ? value : void 0;
1078
+ }
1079
+
1080
+ // src/evaluation/loaders/message-processor.ts
1068
1081
  var ANSI_YELLOW4 = "\x1B[33m";
1069
1082
  var ANSI_RESET4 = "\x1B[0m";
1070
1083
  async function processMessages(options) {
@@ -1370,9 +1383,6 @@ ${messageContent}`);
1370
1383
  questionParts.push(formattedContent);
1371
1384
  }
1372
1385
  }
1373
- if (testCase.code_snippets.length > 0) {
1374
- questionParts.push(testCase.code_snippets.join("\n"));
1375
- }
1376
1386
  question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
1377
1387
  }
1378
1388
  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1571,7 +1581,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1571
1581
  repoRootPath,
1572
1582
  verbose
1573
1583
  }) : [];
1574
- const codeSnippets = extractCodeBlocks(inputSegments);
1575
1584
  let referenceAnswer = "";
1576
1585
  if (outputSegments.length > 0) {
1577
1586
  const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1644,7 +1653,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1644
1653
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
1645
1654
  guideline_patterns: guidelinePatterns,
1646
1655
  file_paths: allFilePaths,
1647
- code_snippets: codeSnippets,
1648
1656
  expected_outcome: outcome,
1649
1657
  evaluator: evalCaseEvaluatorKind,
1650
1658
  evaluators
@@ -4272,6 +4280,167 @@ var MockProvider = class {
4272
4280
  }
4273
4281
  };
4274
4282
 
4283
+ // src/evaluation/providers/pi-agent-sdk.ts
4284
+ var piAgentModule = null;
4285
+ var piAiModule = null;
4286
+ async function loadPiModules() {
4287
+ if (!piAgentModule || !piAiModule) {
4288
+ try {
4289
+ [piAgentModule, piAiModule] = await Promise.all([
4290
+ import("@mariozechner/pi-agent"),
4291
+ import("@mariozechner/pi-ai")
4292
+ ]);
4293
+ } catch (error) {
4294
+ throw new Error(
4295
+ `Failed to load pi-agent-sdk dependencies. Please install them:
4296
+ npm install @mariozechner/pi-agent @mariozechner/pi-ai
4297
+
4298
+ Original error: ${error instanceof Error ? error.message : String(error)}`
4299
+ );
4300
+ }
4301
+ }
4302
+ return {
4303
+ Agent: piAgentModule.Agent,
4304
+ ProviderTransport: piAgentModule.ProviderTransport,
4305
+ getModel: piAiModule.getModel,
4306
+ getEnvApiKey: piAiModule.getEnvApiKey
4307
+ };
4308
+ }
4309
+ var PiAgentSdkProvider = class {
4310
+ id;
4311
+ kind = "pi-agent-sdk";
4312
+ targetName;
4313
+ supportsBatch = false;
4314
+ config;
4315
+ constructor(targetName, config) {
4316
+ this.id = `pi-agent-sdk:${targetName}`;
4317
+ this.targetName = targetName;
4318
+ this.config = config;
4319
+ }
4320
+ async invoke(request) {
4321
+ if (request.signal?.aborted) {
4322
+ throw new Error("Pi agent SDK request was aborted before execution");
4323
+ }
4324
+ const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
4325
+ const startTime = Date.now();
4326
+ const providerName = this.config.provider ?? "anthropic";
4327
+ const modelId = this.config.model ?? "claude-sonnet-4-20250514";
4328
+ const model = getModel(providerName, modelId);
4329
+ const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
4330
+ const transport = new ProviderTransport({
4331
+ getApiKey: async (provider) => {
4332
+ return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
4333
+ }
4334
+ });
4335
+ const agent = new Agent({
4336
+ initialState: {
4337
+ systemPrompt,
4338
+ model,
4339
+ tools: [],
4340
+ // No tools for simple Q&A
4341
+ messages: []
4342
+ },
4343
+ transport
4344
+ });
4345
+ const outputMessages = [];
4346
+ let finalAssistantContent = "";
4347
+ const unsubscribe = agent.subscribe((event) => {
4348
+ if (event.type === "message_end") {
4349
+ const msg = event.message;
4350
+ if (msg.role === "assistant") {
4351
+ const content = extractTextContent2(msg.content);
4352
+ if (content) {
4353
+ finalAssistantContent = content;
4354
+ }
4355
+ }
4356
+ }
4357
+ });
4358
+ try {
4359
+ const timeoutMs = this.config.timeoutMs ?? 12e4;
4360
+ const timeoutPromise = new Promise((_, reject) => {
4361
+ setTimeout(
4362
+ () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
4363
+ timeoutMs
4364
+ );
4365
+ });
4366
+ await Promise.race([agent.prompt(request.question), timeoutPromise]);
4367
+ await agent.waitForIdle();
4368
+ const agentMessages = agent.state.messages;
4369
+ for (const msg of agentMessages) {
4370
+ outputMessages.push(convertAgentMessage(msg));
4371
+ }
4372
+ const durationMs = Date.now() - startTime;
4373
+ return {
4374
+ raw: {
4375
+ messages: agentMessages,
4376
+ systemPrompt,
4377
+ model: this.config.model,
4378
+ provider: this.config.provider
4379
+ },
4380
+ outputMessages,
4381
+ durationMs
4382
+ };
4383
+ } finally {
4384
+ unsubscribe();
4385
+ }
4386
+ }
4387
+ };
4388
+ function extractTextContent2(content) {
4389
+ if (typeof content === "string") {
4390
+ return content;
4391
+ }
4392
+ if (!Array.isArray(content)) {
4393
+ return void 0;
4394
+ }
4395
+ const textParts = [];
4396
+ for (const part of content) {
4397
+ if (!part || typeof part !== "object") {
4398
+ continue;
4399
+ }
4400
+ const p = part;
4401
+ if (p.type === "text" && typeof p.text === "string") {
4402
+ textParts.push(p.text);
4403
+ }
4404
+ }
4405
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
4406
+ }
4407
+ function convertAgentMessage(message) {
4408
+ if (!message || typeof message !== "object") {
4409
+ return { role: "unknown", content: String(message) };
4410
+ }
4411
+ const msg = message;
4412
+ const role = typeof msg.role === "string" ? msg.role : "unknown";
4413
+ const content = extractTextContent2(msg.content);
4414
+ const toolCalls = extractToolCalls2(msg.content);
4415
+ const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
4416
+ return {
4417
+ role,
4418
+ content,
4419
+ toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
4420
+ timestamp
4421
+ };
4422
+ }
4423
+ function extractToolCalls2(content) {
4424
+ if (!Array.isArray(content)) {
4425
+ return [];
4426
+ }
4427
+ const toolCalls = [];
4428
+ for (const part of content) {
4429
+ if (!part || typeof part !== "object") {
4430
+ continue;
4431
+ }
4432
+ const p = part;
4433
+ if (p.type === "tool_use" && typeof p.name === "string") {
4434
+ toolCalls.push({
4435
+ tool: p.name,
4436
+ input: p.input,
4437
+ id: typeof p.id === "string" ? p.id : void 0
4438
+ });
4439
+ }
4440
+ }
4441
+ return toolCalls;
4442
+ }
4443
+
4275
4444
  // src/evaluation/providers/pi-coding-agent.ts
4276
4445
  var import_node_child_process4 = require("child_process");
4277
4446
  var import_node_crypto3 = require("crypto");
@@ -4787,8 +4956,8 @@ function convertPiMessage(message) {
4787
4956
  if (typeof role !== "string") {
4788
4957
  return void 0;
4789
4958
  }
4790
- const content = extractTextContent2(msg.content);
4791
- const toolCalls = extractToolCalls2(msg.content);
4959
+ const content = extractTextContent3(msg.content);
4960
+ const toolCalls = extractToolCalls3(msg.content);
4792
4961
  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
4793
4962
  const metadata = {};
4794
4963
  if (msg.api) metadata.api = msg.api;
@@ -4804,7 +4973,7 @@ function convertPiMessage(message) {
4804
4973
  metadata: Object.keys(metadata).length > 0 ? metadata : void 0
4805
4974
  };
4806
4975
  }
4807
- function extractTextContent2(content) {
4976
+ function extractTextContent3(content) {
4808
4977
  if (typeof content === "string") {
4809
4978
  return content;
4810
4979
  }
@@ -4823,7 +4992,7 @@ function extractTextContent2(content) {
4823
4992
  }
4824
4993
  return textParts.length > 0 ? textParts.join("\n") : void 0;
4825
4994
  }
4826
- function extractToolCalls2(content) {
4995
+ function extractToolCalls3(content) {
4827
4996
  if (!Array.isArray(content)) {
4828
4997
  return [];
4829
4998
  }
@@ -5227,6 +5396,15 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
5227
5396
  providerBatching,
5228
5397
  config: resolvePiCodingAgentConfig(parsed, env)
5229
5398
  };
5399
+ case "pi-agent-sdk":
5400
+ return {
5401
+ kind: "pi-agent-sdk",
5402
+ name: parsed.name,
5403
+ judgeTarget: parsed.judge_target,
5404
+ workers: parsed.workers,
5405
+ providerBatching,
5406
+ config: resolvePiAgentSdkConfig(parsed, env)
5407
+ };
5230
5408
  case "claude-code":
5231
5409
  return {
5232
5410
  kind: "claude-code",
@@ -5448,25 +5626,58 @@ function resolvePiCodingAgentConfig(target, env) {
5448
5626
  systemPrompt
5449
5627
  };
5450
5628
  }
5451
- function resolveClaudeCodeConfig(target, env) {
5452
- const executableSource = target.executable ?? target.command ?? target.binary;
5453
- const modelSource = target.model;
5454
- const argsSource = target.args ?? target.arguments;
5455
- const cwdSource = target.cwd;
5629
+ function resolvePiAgentSdkConfig(target, env) {
5630
+ const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
5631
+ const modelSource = target.model ?? target.pi_model ?? target.piModel;
5632
+ const apiKeySource = target.api_key ?? target.apiKey;
5456
5633
  const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
5457
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
5458
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
5459
5634
  const systemPromptSource = target.system_prompt ?? target.systemPrompt;
5460
- const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
5635
+ const provider = resolveOptionalString(
5636
+ providerSource,
5637
+ env,
5638
+ `${target.name} pi-agent-sdk provider`,
5639
+ {
5640
+ allowLiteral: true,
5641
+ optionalEnv: true
5642
+ }
5643
+ );
5644
+ const model = resolveOptionalString(modelSource, env, `${target.name} pi-agent-sdk model`, {
5461
5645
  allowLiteral: true,
5462
5646
  optionalEnv: true
5463
- }) ?? "claude";
5464
- const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
5465
- allowLiteral: true,
5647
+ });
5648
+ const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi-agent-sdk api key`, {
5649
+ allowLiteral: false,
5466
5650
  optionalEnv: true
5467
5651
  });
5468
- const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
5469
- const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
5652
+ const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
5653
+ const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
5654
+ return {
5655
+ provider,
5656
+ model,
5657
+ apiKey,
5658
+ timeoutMs,
5659
+ systemPrompt
5660
+ };
5661
+ }
5662
+ function resolveClaudeCodeConfig(target, env) {
5663
+ const executableSource = target.executable ?? target.command ?? target.binary;
5664
+ const modelSource = target.model;
5665
+ const argsSource = target.args ?? target.arguments;
5666
+ const cwdSource = target.cwd;
5667
+ const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
5668
+ const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
5669
+ const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
5670
+ const systemPromptSource = target.system_prompt ?? target.systemPrompt;
5671
+ const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
5672
+ allowLiteral: true,
5673
+ optionalEnv: true
5674
+ }) ?? "claude";
5675
+ const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
5676
+ allowLiteral: true,
5677
+ optionalEnv: true
5678
+ });
5679
+ const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
5680
+ const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
5470
5681
  allowLiteral: true,
5471
5682
  optionalEnv: true
5472
5683
  });
@@ -6106,6 +6317,8 @@ function createProvider(target) {
6106
6317
  return new CodexProvider(target.name, target.config);
6107
6318
  case "pi-coding-agent":
6108
6319
  return new PiCodingAgentProvider(target.name, target.config);
6320
+ case "pi-agent-sdk":
6321
+ return new PiAgentSdkProvider(target.name, target.config);
6109
6322
  case "claude-code":
6110
6323
  return new ClaudeCodeProvider(target.name, target.config);
6111
6324
  case "mock":
@@ -6124,9 +6337,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
6124
6337
  return createProvider(resolved);
6125
6338
  }
6126
6339
 
6127
- // src/evaluation/evaluators.ts
6128
- var import_ai2 = require("ai");
6129
- var import_zod3 = require("zod");
6340
+ // src/evaluation/evaluators/scoring.ts
6341
+ function scoreToVerdict(score) {
6342
+ if (score >= 0.8) {
6343
+ return "pass";
6344
+ }
6345
+ if (score >= 0.6) {
6346
+ return "borderline";
6347
+ }
6348
+ return "fail";
6349
+ }
6350
+ function clampScore(value) {
6351
+ if (Number.isNaN(value) || !Number.isFinite(value)) {
6352
+ return 0;
6353
+ }
6354
+ if (value < 0) {
6355
+ return 0;
6356
+ }
6357
+ if (value > 1) {
6358
+ return 1;
6359
+ }
6360
+ return value;
6361
+ }
6362
+ function extractJsonBlob(text) {
6363
+ const match = text.match(/\{[\s\S]*\}/);
6364
+ return match?.[0];
6365
+ }
6366
+ function parseJsonFromText(text) {
6367
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6368
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
6369
+ return JSON.parse(blob);
6370
+ }
6371
+ function isNonEmptyString(value) {
6372
+ return typeof value === "string" && value.trim().length > 0;
6373
+ }
6374
+ function parseJsonSafe(payload) {
6375
+ try {
6376
+ return JSON.parse(payload);
6377
+ } catch {
6378
+ return void 0;
6379
+ }
6380
+ }
6381
+ function deepEqual(a, b) {
6382
+ if (a === b) return true;
6383
+ if (a === null || b === null) return a === b;
6384
+ if (typeof a !== typeof b) return false;
6385
+ if (typeof a !== "object") return a === b;
6386
+ if (Array.isArray(a) !== Array.isArray(b)) return false;
6387
+ if (Array.isArray(a) && Array.isArray(b)) {
6388
+ if (a.length !== b.length) return false;
6389
+ return a.every((val, i) => deepEqual(val, b[i]));
6390
+ }
6391
+ const aObj = a;
6392
+ const bObj = b;
6393
+ const aKeys = Object.keys(aObj);
6394
+ const bKeys = Object.keys(bObj);
6395
+ if (aKeys.length !== bKeys.length) return false;
6396
+ return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
6397
+ }
6130
6398
 
6131
6399
  // src/runtime/exec.ts
6132
6400
  function shellEscapePath(value) {
@@ -6151,7 +6419,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
6151
6419
  cwd: options.cwd,
6152
6420
  stdin: encoder.encode(stdinPayload),
6153
6421
  stdout: "pipe",
6154
- stderr: "pipe"
6422
+ stderr: "pipe",
6423
+ // Merge additional env vars with process.env
6424
+ env: options.env ? { ...process.env, ...options.env } : process.env
6155
6425
  });
6156
6426
  let timedOut = false;
6157
6427
  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -6186,7 +6456,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
6186
6456
  const [cmd, ...args] = argv;
6187
6457
  const child = spawn4(cmd, args, {
6188
6458
  cwd: options.cwd,
6189
- stdio: ["pipe", "pipe", "pipe"]
6459
+ stdio: ["pipe", "pipe", "pipe"],
6460
+ // Merge additional env vars with process.env
6461
+ env: options.env ? { ...process.env, ...options.env } : process.env
6190
6462
  });
6191
6463
  const stdoutChunks = [];
6192
6464
  const stderrChunks = [];
@@ -6239,7 +6511,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
6239
6511
  const child = spawn4(wrappedCommand, {
6240
6512
  shell: true,
6241
6513
  cwd: options.cwd,
6242
- stdio: ["ignore", "ignore", "ignore"]
6514
+ stdio: ["ignore", "ignore", "ignore"],
6515
+ // Merge additional env vars with process.env
6516
+ env: options.env ? { ...process.env, ...options.env } : process.env
6243
6517
  });
6244
6518
  const timeout = options.timeoutMs ? setTimeout(() => {
6245
6519
  child.kill();
@@ -6266,6 +6540,221 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
6266
6540
  }
6267
6541
  }
6268
6542
 
6543
+ // src/runtime/target-proxy.ts
6544
+ var import_node_crypto4 = require("crypto");
6545
+ var import_node_http = require("http");
6546
+ var DEFAULT_MAX_CALLS = 50;
6547
/**
 * Starts a token-protected HTTP proxy bound to 127.0.0.1 that lets an external
 * script invoke evaluation targets.
 *
 * Every route except the CORS preflight requires `Authorization: Bearer <token>`:
 * - `GET /info`: default target name, call budget, calls used, target names.
 * - `POST /invoke`: one provider call, body
 *   `{ question, systemPrompt?, target?, evalCaseId?, attempt? }`.
 * - `POST /invokeBatch`: body `{ requests: [...] }`, executed sequentially;
 *   per-item failures are reported in that item's `rawText` as `Error: ...`.
 *
 * @param {object} options
 * @param {object} options.defaultProvider - Provider used when a request names
 *   no target or names `defaultProvider.targetName`; must expose `targetName`
 *   and `invoke()`.
 * @param {Function} [options.targetResolver] - Maps a target name to a
 *   provider; a falsy result is reported to the client as an unknown target.
 * @param {string[]} [options.availableTargets] - Names reported by `/info` and
 *   in "unknown target" errors; defaults to the default provider's name.
 * @param {number} [options.maxCalls=DEFAULT_MAX_CALLS] - Upper bound on
 *   provider invocations made through this proxy.
 * @returns {Promise<{url: string, token: string, shutdown: Function, getUsageMetadata: Function}>}
 *   Resolves once the server is listening. Rejects if the server fails to bind.
 */
async function createTargetProxy(options) {
  const {
    defaultProvider,
    targetResolver,
    availableTargets,
    // An omitted limit must not silently mean "unlimited".
    maxCalls = DEFAULT_MAX_CALLS
  } = options;
  const token = import_node_crypto4.randomBytes(32).toString("hex");
  const expectedAuth = Buffer.from(`Bearer ${token}`, "utf8");
  let callCount = 0;
  let isShutdown = false;
  const targetsList = availableTargets ?? [defaultProvider.targetName];

  // Constant-time comparison of the Authorization header against the token.
  function isAuthorized(authHeader) {
    if (typeof authHeader !== "string") return false;
    const provided = Buffer.from(authHeader, "utf8");
    // timingSafeEqual throws on length mismatch, so compare lengths first.
    return provided.length === expectedAuth.length && import_node_crypto4.timingSafeEqual(provided, expectedAuth);
  }

  function resolveProvider(targetName) {
    if (targetName === void 0 || targetName === defaultProvider.targetName) {
      return defaultProvider;
    }
    if (targetResolver) {
      return targetResolver(targetName);
    }
    return void 0;
  }

  // Reads the request body and returns it as a plain object, or `undefined`
  // when it is not valid JSON or not a JSON object.
  async function readJsonObject(req) {
    const body = await readBody(req);
    let parsed;
    try {
      parsed = JSON.parse(body);
    } catch {
      return void 0;
    }
    const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isPlainObject ? parsed : void 0;
  }

  function hasQuestion(request) {
    return request !== null && typeof request === "object" && Boolean(request.question) && typeof request.question === "string";
  }

  // Performs one provider call and shapes the proxy response for it.
  async function invokeTarget(provider, request) {
    const response = await provider.invoke({
      question: request.question,
      systemPrompt: request.systemPrompt,
      evalCaseId: request.evalCaseId ?? "proxy",
      attempt: request.attempt ?? 1
    });
    const outputMessages = response.outputMessages ?? [];
    return {
      outputMessages,
      rawText: extractLastAssistantContent(outputMessages)
    };
  }

  const server = import_node_http.createServer(async (req, res) => {
    try {
      res.setHeader("Access-Control-Allow-Origin", "*");
      res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
      res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
      if (req.method === "OPTIONS") {
        res.writeHead(204);
        res.end();
        return;
      }
      if (!isAuthorized(req.headers.authorization)) {
        sendJson(res, 401, { error: "Unauthorized" });
        return;
      }
      if (isShutdown) {
        sendJson(res, 503, { error: "Proxy is shutting down" });
        return;
      }
      const url2 = req.url ?? "";
      if (req.method === "GET" && url2 === "/info") {
        handleInfo(res);
        return;
      }
      if (req.method === "POST" && url2 === "/invoke") {
        await handleInvoke(req, res);
        return;
      }
      if (req.method === "POST" && url2 === "/invokeBatch") {
        await handleInvokeBatch(req, res);
        return;
      }
      sendJson(res, 404, { error: "Not found" });
    } catch (error) {
      // Last-resort guard: a throw here would otherwise become an unhandled
      // rejection of the async request listener.
      if (res.headersSent) {
        res.end();
        return;
      }
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  });

  function handleInfo(res) {
    const response = {
      targetName: defaultProvider.targetName,
      maxCalls,
      callCount,
      availableTargets: targetsList
    };
    sendJson(res, 200, response);
  }

  async function handleInvoke(req, res) {
    if (callCount >= maxCalls) {
      sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
      return;
    }
    try {
      const request = await readJsonObject(req);
      if (!request) {
        sendJson(res, 400, { error: "Request body must be a JSON object" });
        return;
      }
      if (!hasQuestion(request)) {
        sendJson(res, 400, { error: "Missing required field: question" });
        return;
      }
      const provider = resolveProvider(request.target);
      if (!provider) {
        sendJson(res, 400, {
          error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
        });
        return;
      }
      // Re-check the budget: concurrent requests may have consumed it while
      // this request's body was being read.
      if (callCount >= maxCalls) {
        sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
        return;
      }
      callCount++;
      sendJson(res, 200, await invokeTarget(provider, request));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  async function handleInvokeBatch(req, res) {
    try {
      const payload = await readJsonObject(req);
      if (!payload) {
        sendJson(res, 400, { error: "Request body must be a JSON object" });
        return;
      }
      const { requests } = payload;
      if (!Array.isArray(requests)) {
        sendJson(res, 400, { error: "Missing required field: requests (array)" });
        return;
      }
      if (callCount + requests.length > maxCalls) {
        sendJson(res, 429, {
          error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
        });
        return;
      }
      const responses = [];
      for (const request of requests) {
        if (!hasQuestion(request)) {
          responses.push({
            outputMessages: [],
            rawText: "Error: Missing required field: question"
          });
          continue;
        }
        const provider = resolveProvider(request.target);
        if (!provider) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
          });
          continue;
        }
        // The up-front check can be invalidated by requests that ran while
        // earlier items of this batch were awaiting their provider.
        if (callCount >= maxCalls) {
          responses.push({
            outputMessages: [],
            rawText: `Error: Max calls exceeded (limit: ${maxCalls})`
          });
          continue;
        }
        callCount++;
        try {
          responses.push(await invokeTarget(provider, request));
        } catch (error) {
          const message = error instanceof Error ? error.message : String(error);
          responses.push({
            outputMessages: [],
            rawText: `Error: ${message}`
          });
        }
      }
      sendJson(res, 200, { responses });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      sendJson(res, 500, { error: message });
    }
  }

  await new Promise((resolve, reject) => {
    server.once("error", reject);
    // Port 0 asks the OS for a free port; loopback keeps the proxy local.
    server.listen(0, "127.0.0.1", () => {
      server.removeListener("error", reject);
      resolve();
    });
  });
  const address = server.address();
  const url = `http://127.0.0.1:${address.port}`;
  return {
    url,
    token,
    shutdown: async () => {
      isShutdown = true;
      return new Promise((resolve, reject) => {
        server.close((err) => {
          if (err) reject(err);
          else resolve();
        });
        // On runtimes where close() does not drop idle keep-alive sockets by
        // itself, release them explicitly so shutdown cannot stall on them.
        server.closeIdleConnections?.();
      });
    },
    getUsageMetadata: () => ({
      callCount,
      maxCalls
    })
  };
}
6727
/**
 * Writes a JSON response and ends it.
 *
 * The body is serialized before any header is written. If `JSON.stringify`
 * throws (circular structure, BigInt), the response is still untouched and
 * the caller can send an error response instead.
 *
 * @param {object} res - Response object exposing `writeHead` and `end`.
 * @param {number} statusCode - HTTP status code to send.
 * @param {*} body - Value to serialize as the response body.
 * @throws {TypeError} When `body` cannot be serialized to JSON.
 */
function sendJson(res, statusCode, body) {
  const payload = JSON.stringify(body);
  res.writeHead(statusCode, { "Content-Type": "application/json" });
  res.end(payload);
}
6731
/**
 * Collects an entire readable stream and decodes it as UTF-8 text.
 *
 * @param {object} req - Stream emitting `data`, `end` and `error` events.
 * @returns {Promise<string>} Resolves with the full body once the stream
 *   ends; rejects with the stream's error if it emits `error`.
 */
function readBody(req) {
  return new Promise((resolve, reject) => {
    const received = [];
    const onData = (piece) => {
      received.push(piece);
    };
    const onEnd = () => {
      // Decode once at the end so multi-byte characters split across chunks stay intact.
      const whole = Buffer.concat(received);
      resolve(whole.toString("utf8"));
    };
    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
6739
/**
 * Finds the text of the most recent assistant message.
 *
 * Messages are scanned from last to first. For an assistant message with
 * string content, that string is returned. For array content, the first part
 * that is an object with a `text` property is returned as a string. An
 * assistant message that yields no text is skipped and the scan continues.
 *
 * @param {Array<object>} messages - Conversation messages with `role` and `content`.
 * @returns {string|undefined} The extracted text, or `undefined` when none is found.
 */
function extractLastAssistantContent(messages) {
  let index = messages.length;
  while (index-- > 0) {
    const { role, content } = messages[index];
    if (role !== "assistant" || content === void 0) {
      continue;
    }
    if (typeof content === "string") {
      return content;
    }
    if (!Array.isArray(content)) {
      continue;
    }
    const textPart = content.find(
      (part) => typeof part === "object" && part !== null && "text" in part
    );
    if (textPart !== void 0) {
      return String(textPart.text);
    }
  }
  return void 0;
}
6757
+
6269
6758
  // src/evaluation/case-conversion.ts
6270
6759
  function toSnakeCase(str) {
6271
6760
  if (/^[A-Z]/.test(str)) {
@@ -6273,12 +6762,6 @@ function toSnakeCase(str) {
6273
6762
  }
6274
6763
  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
6275
6764
  }
6276
- function toCamelCase(str) {
6277
- if (/^[A-Z]/.test(str)) {
6278
- return str;
6279
- }
6280
- return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
6281
- }
6282
6765
  function toSnakeCaseDeep(obj) {
6283
6766
  if (obj === null || obj === void 0) {
6284
6767
  return obj;
@@ -6296,61 +6779,184 @@ function toSnakeCaseDeep(obj) {
6296
6779
  }
6297
6780
  return obj;
6298
6781
  }
6299
- function toCamelCaseDeep(obj) {
6300
- if (obj === null || obj === void 0) {
6301
- return obj;
6302
- }
6303
- if (Array.isArray(obj)) {
6304
- return obj.map((item) => toCamelCaseDeep(item));
6305
- }
6306
- if (typeof obj === "object") {
6307
- const result = {};
6308
- for (const [key, value] of Object.entries(obj)) {
6309
- const camelKey = toCamelCase(key);
6310
- result[camelKey] = toCamelCaseDeep(value);
6311
- }
6312
- return result;
6313
- }
6314
- return obj;
6315
- }
6316
6782
 
6317
- // src/evaluation/providers/types.ts
6318
- var AGENT_PROVIDER_KINDS = [
6319
- "codex",
6320
- "pi-coding-agent",
6321
- "claude-code",
6322
- "vscode",
6323
- "vscode-insiders"
6324
- ];
6325
- function extractLastAssistantContent(messages) {
6326
- if (!messages || messages.length === 0) {
6327
- return "";
6783
+ // src/evaluation/evaluators/code-evaluator.ts
6784
+ var CodeEvaluator = class {
6785
+ kind = "code";
6786
+ script;
6787
+ cwd;
6788
+ agentTimeoutMs;
6789
+ config;
6790
+ target;
6791
+ constructor(options) {
6792
+ this.script = options.script;
6793
+ this.cwd = options.cwd;
6794
+ this.agentTimeoutMs = options.agentTimeoutMs;
6795
+ this.config = options.config;
6796
+ this.target = options.target;
6328
6797
  }
6329
- for (let i = messages.length - 1; i >= 0; i--) {
6330
- const msg = messages[i];
6331
- if (msg.role === "assistant" && msg.content !== void 0) {
6332
- if (typeof msg.content === "string") {
6333
- return msg.content;
6334
- }
6335
- return JSON.stringify(msg.content);
6798
+ async evaluate(context) {
6799
+ const payload = {
6800
+ question: context.evalCase.question,
6801
+ expectedOutcome: context.evalCase.expected_outcome,
6802
+ expectedMessages: context.evalCase.expected_messages,
6803
+ referenceAnswer: context.evalCase.reference_answer,
6804
+ candidateAnswer: context.candidate,
6805
+ outputMessages: context.outputMessages ?? null,
6806
+ guidelineFiles: context.evalCase.guideline_paths,
6807
+ inputFiles: context.evalCase.file_paths.filter(
6808
+ (path17) => !context.evalCase.guideline_paths.includes(path17)
6809
+ ),
6810
+ inputMessages: context.evalCase.input_messages,
6811
+ traceSummary: context.traceSummary ?? null,
6812
+ config: this.config ?? null
6813
+ };
6814
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
6815
+ let proxyEnv;
6816
+ let proxyShutdown;
6817
+ let getProxyUsage;
6818
+ if (this.target !== void 0 && context.judgeProvider) {
6819
+ const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
6820
+ const proxy = await createTargetProxy({
6821
+ defaultProvider: context.judgeProvider,
6822
+ targetResolver: context.targetResolver,
6823
+ availableTargets: context.availableTargets,
6824
+ maxCalls
6825
+ });
6826
+ proxyEnv = {
6827
+ AGENTV_TARGET_PROXY_URL: proxy.url,
6828
+ AGENTV_TARGET_PROXY_TOKEN: proxy.token
6829
+ };
6830
+ proxyShutdown = proxy.shutdown;
6831
+ getProxyUsage = proxy.getUsageMetadata;
6336
6832
  }
6337
- }
6338
- return "";
6339
- }
6340
- function isAgentProvider(provider) {
6341
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
6342
- }
6343
-
6344
- // src/evaluation/evaluators.ts
6345
- var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
6346
-
6347
- Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
6348
-
6349
- Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
6350
-
6351
- [[ ## expected_outcome ## ]]
6352
- {{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
6353
-
6833
+ try {
6834
+ const stdout = await executeScript(
6835
+ this.script,
6836
+ inputPayload,
6837
+ this.agentTimeoutMs,
6838
+ this.cwd,
6839
+ proxyEnv
6840
+ );
6841
+ const parsed = parseJsonSafe(stdout);
6842
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6843
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6844
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6845
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6846
+ const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
6847
+ const proxyUsage = getProxyUsage?.();
6848
+ const evaluatorRawRequest = {
6849
+ script: this.script,
6850
+ ...this.cwd ? { cwd: this.cwd } : {},
6851
+ ...proxyUsage ? {
6852
+ target_proxy: {
6853
+ call_count: proxyUsage.callCount,
6854
+ max_calls: proxyUsage.maxCalls
6855
+ }
6856
+ } : {}
6857
+ };
6858
+ return {
6859
+ score,
6860
+ verdict: scoreToVerdict(score),
6861
+ hits,
6862
+ misses,
6863
+ expectedAspectCount: hits.length + misses.length || 1,
6864
+ reasoning,
6865
+ evaluatorRawRequest,
6866
+ ...details ? { details } : {}
6867
+ };
6868
+ } catch (error) {
6869
+ const message = error instanceof Error ? error.message : String(error);
6870
+ const proxyUsage = getProxyUsage?.();
6871
+ return {
6872
+ score: 0,
6873
+ verdict: "fail",
6874
+ hits: [],
6875
+ misses: [`Code evaluator failed: ${message}`],
6876
+ expectedAspectCount: 1,
6877
+ reasoning: message,
6878
+ evaluatorRawRequest: {
6879
+ script: this.script,
6880
+ ...this.cwd ? { cwd: this.cwd } : {},
6881
+ ...proxyUsage ? {
6882
+ target_proxy: {
6883
+ call_count: proxyUsage.callCount,
6884
+ max_calls: proxyUsage.maxCalls
6885
+ }
6886
+ } : {},
6887
+ error: message
6888
+ }
6889
+ };
6890
+ } finally {
6891
+ if (proxyShutdown) {
6892
+ await proxyShutdown();
6893
+ }
6894
+ }
6895
+ }
6896
+ };
6897
/**
 * Runs an evaluator script, feeding it `input` on stdin, and returns its
 * trimmed stdout.
 *
 * A string `scriptPath` is run through `execShellWithStdin`; any other value
 * is passed to `execFileWithStdin`.
 *
 * @param {string|Array} scriptPath - Shell command string, or a value accepted by `execFileWithStdin`.
 * @param {string} input - Payload written to the script's stdin.
 * @param {number} [agentTimeoutMs] - Forwarded to the runner as `timeoutMs`.
 * @param {string} [cwd] - Forwarded to the runner as `cwd`.
 * @param {object} [env] - Forwarded to the runner as `env`.
 * @returns {Promise<string>} The script's stdout with surrounding whitespace removed.
 * @throws {Error} When the script exits with a non-zero code; the message
 *   includes the formatted stderr when there is any.
 */
async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
  const runOptions = { cwd, timeoutMs: agentTimeoutMs, env };
  let outcome;
  if (typeof scriptPath === "string") {
    outcome = await execShellWithStdin(scriptPath, input, runOptions);
  } else {
    outcome = await execFileWithStdin(scriptPath, input, runOptions);
  }
  const { stdout, stderr, exitCode } = outcome;
  if (exitCode === 0) {
    return stdout.trim();
  }
  const trimmedErr = formatStderr(stderr);
  if (trimmedErr.length > 0) {
    throw new Error(`Code evaluator exited with code ${exitCode}: ${trimmedErr}`);
  }
  throw new Error(`Code evaluator exited with code ${exitCode}`);
}
6907
/**
 * Trims stderr output and caps it at its last 2000 characters.
 *
 * @param {string} stderr - Raw stderr text.
 * @returns {string} The trimmed text, or a truncation notice line followed by
 *   the last 2000 characters when the trimmed text is longer than that.
 */
function formatStderr(stderr) {
  const limit = 2e3;
  const text = stderr.trim();
  // Keep the tail: the end of stderr is where the final error usually appears.
  return text.length > limit ? `...(truncated, last ${limit} chars)
${text.slice(-limit)}` : text;
}
6917
+
6918
+ // src/evaluation/evaluators/composite.ts
6919
+ var import_ai3 = require("ai");
6920
+
6921
+ // src/evaluation/providers/types.ts
6922
// Provider kinds that `isAgentProvider` treats as agent providers.
var AGENT_PROVIDER_KINDS = ["codex", "pi-coding-agent", "claude-code", "vscode", "vscode-insiders"];

/**
 * Returns the content of the most recent assistant message as a string.
 *
 * String content is returned unchanged; any other defined content is
 * JSON-serialized.
 *
 * @param {Array<object>|null|undefined} messages - Conversation messages with `role` and `content`.
 * @returns {string} The content, or `""` when there are no messages or no
 *   assistant message has defined content.
 */
function extractLastAssistantContent2(messages) {
  if (!messages?.length) {
    return "";
  }
  for (let idx = messages.length - 1; idx >= 0; idx -= 1) {
    const { role, content } = messages[idx];
    if (role !== "assistant" || content === void 0) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}

/**
 * Tells whether a provider's `kind` is one of `AGENT_PROVIDER_KINDS`.
 *
 * @param {object|null|undefined} provider - Provider exposing a `kind` property.
 * @returns {boolean} `false` for a missing provider or a kind not in the list.
 */
function isAgentProvider(provider) {
  if (!provider) {
    return false;
  }
  return AGENT_PROVIDER_KINDS.includes(provider.kind);
}
6947
+
6948
+ // src/evaluation/evaluators/llm-judge.ts
6949
+ var import_ai2 = require("ai");
6950
+ var import_zod3 = require("zod");
6951
+ var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
6952
+
6953
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
6954
+
6955
+ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
6956
+
6957
+ [[ ## expected_outcome ## ]]
6958
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
6959
+
6354
6960
  [[ ## question ## ]]
6355
6961
  {{${TEMPLATE_VARIABLES.QUESTION}}}
6356
6962
 
@@ -6421,7 +7027,7 @@ var LlmJudgeEvaluator = class {
6421
7027
  target: judgeProvider.targetName
6422
7028
  };
6423
7029
  try {
6424
- const { data, providerResponse } = await this.runWithRetry({
7030
+ const { data } = await this.runWithRetry({
6425
7031
  context,
6426
7032
  judgeProvider,
6427
7033
  systemPrompt,
@@ -6534,7 +7140,7 @@ var LlmJudgeEvaluator = class {
6534
7140
  temperature: this.temperature
6535
7141
  });
6536
7142
  const data = schema.parse(
6537
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
7143
+ parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
6538
7144
  );
6539
7145
  return { data, providerResponse: response };
6540
7146
  } catch (e) {
@@ -6570,105 +7176,11 @@ You must return a valid JSON object matching this schema:
6570
7176
  "overall_reasoning": "string (summary)"
6571
7177
  }`;
6572
7178
  }
6573
- function scoreToVerdict(score) {
6574
- if (score >= 0.8) {
6575
- return "pass";
6576
- }
6577
- if (score >= 0.6) {
6578
- return "borderline";
6579
- }
6580
- return "fail";
6581
- }
6582
- function clampScore(value) {
6583
- if (Number.isNaN(value) || !Number.isFinite(value)) {
6584
- return 0;
6585
- }
6586
- if (value < 0) {
6587
- return 0;
6588
- }
6589
- if (value > 1) {
6590
- return 1;
6591
- }
6592
- return value;
6593
- }
6594
- function extractJsonBlob(text) {
6595
- const match = text.match(/\{[\s\S]*\}/);
6596
- return match?.[0];
6597
- }
6598
- function parseJsonFromText(text) {
6599
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6600
- const blob = extractJsonBlob(cleaned) ?? cleaned;
6601
- return JSON.parse(blob);
6602
- }
6603
- function isNonEmptyString(value) {
6604
- return typeof value === "string" && value.trim().length > 0;
7179
/**
 * Replaces `{{ name }}` placeholders in a template with values from `variables`.
 *
 * Only own properties of `variables` are used. This keeps a placeholder such
 * as `{{constructor}}` or `{{toString}}` from resolving to a member inherited
 * from `Object.prototype`. Placeholders with no own, non-nullish value are
 * left in the output unchanged.
 *
 * @param {string} template - Text containing `{{name}}` placeholders; names
 *   may use letters, digits and underscores, with optional inner whitespace.
 * @param {object} variables - Map of placeholder name to replacement value.
 * @returns {string} The template with known placeholders substituted.
 */
function substituteVariables(template, variables) {
  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
    if (!Object.hasOwn(variables, varName)) {
      return match;
    }
    return variables[varName] ?? match;
  });
}
6606
- var CodeEvaluator = class {
6607
- kind = "code";
6608
- script;
6609
- cwd;
6610
- agentTimeoutMs;
6611
- config;
6612
- constructor(options) {
6613
- this.script = options.script;
6614
- this.cwd = options.cwd;
6615
- this.agentTimeoutMs = options.agentTimeoutMs;
6616
- this.config = options.config;
6617
- }
6618
- async evaluate(context) {
6619
- const payload = {
6620
- question: context.evalCase.question,
6621
- expectedOutcome: context.evalCase.expected_outcome,
6622
- expectedMessages: context.evalCase.expected_messages,
6623
- referenceAnswer: context.evalCase.reference_answer,
6624
- candidateAnswer: context.candidate,
6625
- outputMessages: context.outputMessages ?? null,
6626
- guidelineFiles: context.evalCase.guideline_paths,
6627
- inputFiles: context.evalCase.file_paths.filter(
6628
- (path17) => !context.evalCase.guideline_paths.includes(path17)
6629
- ),
6630
- inputMessages: context.evalCase.input_messages,
6631
- traceSummary: context.traceSummary ?? null,
6632
- config: this.config ?? null
6633
- };
6634
- const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
6635
- try {
6636
- const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
6637
- const parsed = parseJsonSafe(stdout);
6638
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6639
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6640
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6641
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6642
- return {
6643
- score,
6644
- verdict: scoreToVerdict(score),
6645
- hits,
6646
- misses,
6647
- expectedAspectCount: hits.length + misses.length || 1,
6648
- reasoning,
6649
- evaluatorRawRequest: {
6650
- script: this.script,
6651
- ...this.cwd ? { cwd: this.cwd } : {}
6652
- }
6653
- };
6654
- } catch (error) {
6655
- const message = error instanceof Error ? error.message : String(error);
6656
- return {
6657
- score: 0,
6658
- verdict: "fail",
6659
- hits: [],
6660
- misses: [`Code evaluator failed: ${message}`],
6661
- expectedAspectCount: 1,
6662
- reasoning: message,
6663
- evaluatorRawRequest: {
6664
- script: this.script,
6665
- ...this.cwd ? { cwd: this.cwd } : {},
6666
- error: message
6667
- }
6668
- };
6669
- }
6670
- }
6671
- };
6672
7184
  function calculateRubricScore(result, rubrics) {
6673
7185
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
6674
7186
  const hits = [];
@@ -6696,273 +7208,281 @@ function calculateRubricScore(result, rubrics) {
6696
7208
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
6697
7209
  return { score, verdict, hits, misses };
6698
7210
  }
6699
- async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
6700
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
6701
- if (exitCode !== 0) {
6702
- const trimmedErr = formatStderr(stderr);
6703
- throw new Error(
6704
- trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
6705
- );
7211
+
7212
+ // src/evaluation/evaluators/composite.ts
7213
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
7214
+ {{EVALUATOR_RESULTS_JSON}}
7215
+
7216
+ Decide the final score and verdict based on all evaluator results.
7217
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
7218
+ var CompositeEvaluator = class {
7219
+ kind = "composite";
7220
+ config;
7221
+ evaluatorFactory;
7222
+ cwd;
7223
+ constructor(options) {
7224
+ this.config = options.config;
7225
+ this.evaluatorFactory = options.evaluatorFactory;
7226
+ this.cwd = options.cwd;
6706
7227
  }
6707
- return stdout.trim();
6708
- }
6709
- function formatStderr(stderr) {
6710
- const trimmed = stderr.trim();
6711
- const maxLength = 2e3;
6712
- if (trimmed.length <= maxLength) {
6713
- return trimmed;
7228
+ async evaluate(context) {
7229
+ const memberResults = await Promise.all(
7230
+ this.config.evaluators.map(async (memberConfig) => {
7231
+ const evaluator = this.evaluatorFactory.create(memberConfig, context);
7232
+ return {
7233
+ id: memberConfig.name,
7234
+ type: memberConfig.type,
7235
+ result: await evaluator.evaluate(context)
7236
+ };
7237
+ })
7238
+ );
7239
+ return this.aggregate(memberResults, context);
6714
7240
  }
6715
- const tail = trimmed.slice(-maxLength);
6716
- return `...(truncated, last ${maxLength} chars)
6717
- ${tail}`;
6718
- }
6719
- function parseJsonSafe(payload) {
6720
- try {
6721
- return JSON.parse(payload);
6722
- } catch {
6723
- return void 0;
7241
+ async aggregate(results, context) {
7242
+ const aggregator = this.config.aggregator;
7243
+ switch (aggregator.type) {
7244
+ case "code_judge":
7245
+ return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
7246
+ case "llm_judge":
7247
+ return this.runLlmAggregator(results, context, aggregator);
7248
+ default:
7249
+ return this.runWeightedAverage(results, aggregator.weights);
7250
+ }
6724
7251
  }
6725
- }
6726
- function substituteVariables(template, variables) {
6727
- return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
6728
- return variables[varName] ?? match;
6729
- });
6730
- }
6731
- function deepEqual(a, b) {
6732
- if (a === b) return true;
6733
- if (a === null || b === null) return a === b;
6734
- if (typeof a !== typeof b) return false;
6735
- if (typeof a !== "object") return a === b;
6736
- if (Array.isArray(a) !== Array.isArray(b)) return false;
6737
- if (Array.isArray(a) && Array.isArray(b)) {
6738
- if (a.length !== b.length) return false;
6739
- return a.every((val, i) => deepEqual(val, b[i]));
6740
- }
6741
- const aObj = a;
6742
- const bObj = b;
6743
- const aKeys = Object.keys(aObj);
6744
- const bKeys = Object.keys(bObj);
6745
- if (aKeys.length !== bKeys.length) return false;
6746
- return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
6747
- }
6748
- function argsMatch(expected, actual) {
6749
- if (expected === void 0) return true;
6750
- if (expected === "any") return true;
6751
- if (actual === void 0) return false;
6752
- for (const key of Object.keys(expected)) {
6753
- if (!Object.hasOwn(actual, key)) return false;
6754
- if (!deepEqual(expected[key], actual[key])) return false;
6755
- }
6756
- return true;
6757
- }
6758
- var ToolTrajectoryEvaluator = class {
6759
- kind = "tool_trajectory";
6760
- config;
6761
- constructor(options) {
6762
- this.config = options.config;
7252
+ runWeightedAverage(results, weights) {
7253
+ let totalWeight = 0;
7254
+ let weightedSum = 0;
7255
+ const allHits = [];
7256
+ const allMisses = [];
7257
+ const reasoningParts = [];
7258
+ const evaluatorResults = [];
7259
+ for (const member of results) {
7260
+ const weight = weights?.[member.id] ?? 1;
7261
+ totalWeight += weight;
7262
+ weightedSum += member.result.score * weight;
7263
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
7264
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
7265
+ if (member.result.reasoning) {
7266
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
7267
+ }
7268
+ evaluatorResults.push({
7269
+ name: member.id,
7270
+ type: member.type,
7271
+ score: member.result.score,
7272
+ weight,
7273
+ verdict: member.result.verdict,
7274
+ hits: [...member.result.hits],
7275
+ misses: [...member.result.misses],
7276
+ reasoning: member.result.reasoning,
7277
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7278
+ evaluatorResults: member.result.evaluatorResults,
7279
+ details: member.result.details
7280
+ });
7281
+ }
7282
+ const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
7283
+ return {
7284
+ score: clampScore(finalScore),
7285
+ verdict: scoreToVerdict(finalScore),
7286
+ hits: allHits,
7287
+ misses: allMisses,
7288
+ expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
7289
+ reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
7290
+ evaluatorRawRequest: {
7291
+ aggregator: "weighted_average",
7292
+ ...weights ? { weights } : {}
7293
+ },
7294
+ evaluatorResults
7295
+ };
6763
7296
  }
6764
- evaluate(context) {
6765
- const { outputMessages, traceSummary } = context;
6766
- const toolCalls = this.extractToolCallsFromMessages(outputMessages);
6767
- if (toolCalls.length === 0 && !traceSummary) {
7297
+ async runCodeAggregator(results, scriptPath, cwd, weights) {
7298
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7299
+ const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
7300
+ const evaluatorResults = results.map((member) => ({
7301
+ name: member.id,
7302
+ type: member.type,
7303
+ score: member.result.score,
7304
+ weight: weights?.[member.id] ?? 1,
7305
+ verdict: member.result.verdict,
7306
+ hits: [...member.result.hits],
7307
+ misses: [...member.result.misses],
7308
+ reasoning: member.result.reasoning,
7309
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7310
+ evaluatorResults: member.result.evaluatorResults,
7311
+ details: member.result.details
7312
+ }));
7313
+ try {
7314
+ const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
7315
+ const parsed = parseJsonSafe(stdout);
7316
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
7317
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
7318
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
7319
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
7320
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
7321
+ return {
7322
+ score,
7323
+ verdict,
7324
+ hits,
7325
+ misses,
7326
+ expectedAspectCount: hits.length + misses.length || 1,
7327
+ reasoning,
7328
+ evaluatorRawRequest: {
7329
+ aggregator: "code_judge",
7330
+ script: scriptPath
7331
+ },
7332
+ evaluatorResults
7333
+ };
7334
+ } catch (error) {
7335
+ const message = error instanceof Error ? error.message : String(error);
6768
7336
  return {
6769
7337
  score: 0,
6770
7338
  verdict: "fail",
6771
7339
  hits: [],
6772
- misses: ["No trace available for evaluation"],
6773
- expectedAspectCount: 1
7340
+ misses: [`Code aggregator failed: ${message}`],
7341
+ expectedAspectCount: 1,
7342
+ reasoning: message,
7343
+ evaluatorRawRequest: {
7344
+ aggregator: "code_judge",
7345
+ script: scriptPath,
7346
+ error: message
7347
+ },
7348
+ evaluatorResults
6774
7349
  };
6775
7350
  }
6776
- const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
6777
- if (!summary) {
7351
+ }
7352
+ async runLlmAggregator(results, context, config) {
7353
+ const judgeProvider = context.judgeProvider;
7354
+ if (!judgeProvider) {
7355
+ throw new Error("No judge provider available for LLM aggregation");
7356
+ }
7357
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7358
+ const resultsJson = JSON.stringify(resultsObject, null, 2);
7359
+ const evaluatorResults = results.map((member) => ({
7360
+ name: member.id,
7361
+ type: member.type,
7362
+ score: member.result.score,
7363
+ verdict: member.result.verdict,
7364
+ hits: [...member.result.hits],
7365
+ misses: [...member.result.misses],
7366
+ reasoning: member.result.reasoning,
7367
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
7368
+ evaluatorResults: member.result.evaluatorResults,
7369
+ details: member.result.details
7370
+ }));
7371
+ const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
7372
+ const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
7373
+ const systemPrompt = buildOutputSchema();
7374
+ const evaluatorRawRequest = {
7375
+ aggregator: "llm_judge",
7376
+ userPrompt,
7377
+ systemPrompt,
7378
+ target: judgeProvider.targetName
7379
+ };
7380
+ try {
7381
+ const model = judgeProvider.asLanguageModel?.();
7382
+ if (model) {
7383
+ const { text } = await (0, import_ai3.generateText)({
7384
+ model,
7385
+ system: systemPrompt,
7386
+ prompt: userPrompt
7387
+ });
7388
+ const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
7389
+ const score2 = clampScore(data2.score);
7390
+ const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
7391
+ const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
7392
+ const reasoning2 = data2.reasoning;
7393
+ return {
7394
+ score: score2,
7395
+ verdict: scoreToVerdict(score2),
7396
+ hits: hits2,
7397
+ misses: misses2,
7398
+ expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
7399
+ reasoning: reasoning2,
7400
+ evaluatorRawRequest,
7401
+ evaluatorResults
7402
+ };
7403
+ }
7404
+ const response = await judgeProvider.invoke({
7405
+ question: userPrompt,
7406
+ systemPrompt,
7407
+ evalCaseId: context.evalCase.id,
7408
+ attempt: context.attempt
7409
+ });
7410
+ const data = freeformEvaluationSchema.parse(
7411
+ parseJsonFromText(extractLastAssistantContent2(response.outputMessages))
7412
+ );
7413
+ const score = clampScore(data.score);
7414
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
7415
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
7416
+ const reasoning = data.reasoning;
7417
+ return {
7418
+ score,
7419
+ verdict: scoreToVerdict(score),
7420
+ hits,
7421
+ misses,
7422
+ expectedAspectCount: Math.max(hits.length + misses.length, 1),
7423
+ reasoning,
7424
+ evaluatorRawRequest,
7425
+ evaluatorResults
7426
+ };
7427
+ } catch {
6778
7428
  return {
6779
7429
  score: 0,
6780
7430
  verdict: "fail",
6781
7431
  hits: [],
6782
- misses: ["No trace available for evaluation"],
6783
- expectedAspectCount: 1
7432
+ misses: [],
7433
+ expectedAspectCount: 1,
7434
+ evaluatorRawRequest,
7435
+ evaluatorResults
6784
7436
  };
6785
7437
  }
6786
- switch (this.config.mode) {
6787
- case "any_order":
6788
- return this.evaluateAnyOrder(summary);
6789
- case "in_order":
6790
- return this.evaluateInOrder(toolCalls);
6791
- case "exact":
6792
- return this.evaluateExact(toolCalls);
6793
- default:
6794
- return {
6795
- score: 0,
6796
- verdict: "fail",
6797
- hits: [],
6798
- misses: [`Unknown mode: ${this.config.mode}`],
6799
- expectedAspectCount: 1
6800
- };
6801
- }
6802
7438
  }
6803
- /**
6804
- * Extract tool calls from output messages.
6805
- */
6806
- extractToolCallsFromMessages(messages) {
6807
- if (!messages) {
6808
- return [];
6809
- }
6810
- const toolCalls = [];
6811
- for (const message of messages) {
6812
- if (message.toolCalls) {
6813
- for (const call of message.toolCalls) {
6814
- toolCalls.push({
6815
- name: call.tool,
6816
- args: call.input
6817
- });
6818
- }
6819
- }
6820
- }
6821
- return toolCalls;
7439
+ };
7440
+
7441
+ // src/evaluation/evaluators/cost.ts
7442
+ var CostEvaluator = class {
7443
+ kind = "cost";
7444
+ config;
7445
+ constructor(options) {
7446
+ this.config = options.config;
6822
7447
  }
6823
- /**
6824
- * Build a summary from extracted tool calls.
6825
- */
6826
- buildSummary(toolCalls) {
6827
- const toolCallsByName = {};
6828
- for (const call of toolCalls) {
6829
- toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
6830
- }
6831
- const toolNames = Object.keys(toolCallsByName).sort();
6832
- return {
6833
- eventCount: toolCalls.length,
6834
- toolNames,
6835
- toolCallsByName,
6836
- errorCount: 0
6837
- };
6838
- }
6839
- evaluateAnyOrder(summary) {
6840
- const minimums = this.config.minimums ?? {};
6841
- const toolNames = Object.keys(minimums);
6842
- if (toolNames.length === 0) {
6843
- return {
6844
- score: 1,
6845
- verdict: "pass",
6846
- hits: ["No tool requirements specified"],
6847
- misses: [],
6848
- expectedAspectCount: 0
6849
- };
6850
- }
6851
- const hits = [];
6852
- const misses = [];
6853
- for (const toolName of toolNames) {
6854
- const required = minimums[toolName];
6855
- const actual = summary.toolCallsByName[toolName] ?? 0;
6856
- if (actual >= required) {
6857
- hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
6858
- } else {
6859
- misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
6860
- }
6861
- }
6862
- const score = hits.length / toolNames.length;
6863
- return {
6864
- score,
6865
- verdict: scoreToVerdict(score),
6866
- hits,
6867
- misses,
6868
- expectedAspectCount: toolNames.length
6869
- };
6870
- }
6871
- evaluateInOrder(toolCalls) {
6872
- const expected = this.config.expected ?? [];
6873
- if (expected.length === 0) {
7448
+ evaluate(context) {
7449
+ const { budget } = this.config;
7450
+ const costUsd = context.traceSummary?.costUsd;
7451
+ if (costUsd === void 0) {
6874
7452
  return {
6875
- score: 1,
6876
- verdict: "pass",
6877
- hits: ["No tool sequence specified"],
6878
- misses: [],
6879
- expectedAspectCount: 0
6880
- };
6881
- }
6882
- const hits = [];
6883
- const misses = [];
6884
- let actualIndex = 0;
6885
- for (let i = 0; i < expected.length; i++) {
6886
- const expectedItem = expected[i];
6887
- const expectedTool = expectedItem.tool;
6888
- let found = false;
6889
- let argsMismatch = false;
6890
- while (actualIndex < toolCalls.length) {
6891
- const actualCall = toolCalls[actualIndex];
6892
- if (actualCall.name === expectedTool) {
6893
- if (argsMatch(expectedItem.args, actualCall.args)) {
6894
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
6895
- actualIndex++;
6896
- found = true;
6897
- break;
6898
- }
6899
- misses.push(
6900
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
6901
- );
6902
- actualIndex++;
6903
- argsMismatch = true;
6904
- break;
7453
+ score: 0,
7454
+ verdict: "fail",
7455
+ hits: [],
7456
+ misses: ["No cost data available in trace"],
7457
+ expectedAspectCount: 1,
7458
+ reasoning: "Execution cost not reported by provider",
7459
+ evaluatorRawRequest: {
7460
+ type: "cost",
7461
+ budget,
7462
+ costUsd: null
6905
7463
  }
6906
- actualIndex++;
6907
- }
6908
- if (!found && !argsMismatch) {
6909
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
6910
- }
6911
- }
6912
- const score = hits.length / expected.length;
6913
- return {
6914
- score,
6915
- verdict: scoreToVerdict(score),
6916
- hits,
6917
- misses,
6918
- expectedAspectCount: expected.length
6919
- };
6920
- }
6921
- evaluateExact(toolCalls) {
6922
- const expected = this.config.expected ?? [];
6923
- if (expected.length === 0) {
6924
- return {
6925
- score: 1,
6926
- verdict: "pass",
6927
- hits: ["No tool sequence specified"],
6928
- misses: [],
6929
- expectedAspectCount: 0
6930
7464
  };
6931
7465
  }
6932
- const hits = [];
6933
- const misses = [];
6934
- if (toolCalls.length !== expected.length) {
6935
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
6936
- }
6937
- const checkLength = Math.min(expected.length, toolCalls.length);
6938
- for (let i = 0; i < checkLength; i++) {
6939
- const expectedItem = expected[i];
6940
- const expectedTool = expectedItem.tool;
6941
- const actualCall = toolCalls[i];
6942
- const actualTool = actualCall.name;
6943
- if (actualTool === expectedTool) {
6944
- if (argsMatch(expectedItem.args, actualCall.args)) {
6945
- hits.push(`Position ${i}: ${expectedTool}`);
6946
- } else {
6947
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
6948
- }
6949
- } else {
6950
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
6951
- }
6952
- }
6953
- for (let i = checkLength; i < expected.length; i++) {
6954
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
6955
- }
6956
- const score = hits.length / expected.length;
7466
+ const passed = costUsd <= budget;
7467
+ const score = passed ? 1 : 0;
7468
+ const formatCost = (n) => `$${n.toFixed(4)}`;
6957
7469
  return {
6958
7470
  score,
6959
- verdict: scoreToVerdict(score),
6960
- hits,
6961
- misses,
6962
- expectedAspectCount: expected.length
7471
+ verdict: passed ? "pass" : "fail",
7472
+ hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
7473
+ misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
7474
+ expectedAspectCount: 1,
7475
+ reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
7476
+ evaluatorRawRequest: {
7477
+ type: "cost",
7478
+ budget,
7479
+ costUsd
7480
+ }
6963
7481
  };
6964
7482
  }
6965
7483
  };
7484
+
7485
+ // src/evaluation/evaluators/field-accuracy.ts
6966
7486
  var DEFAULT_DATE_FORMATS = [
6967
7487
  "YYYY-MM-DDTHH:mm:ssZ",
6968
7488
  // ISO with timezone
@@ -7058,551 +7578,326 @@ var FieldAccuracyEvaluator = class {
7058
7578
  return message.content;
7059
7579
  }
7060
7580
  if (typeof message.content === "string") {
7061
- try {
7062
- return parseJsonFromTextSafe(message.content);
7063
- } catch {
7064
- }
7065
- }
7066
- }
7067
- }
7068
- return void 0;
7069
- }
7070
- /**
7071
- * Evaluate a single field against the expected value.
7072
- */
7073
- evaluateField(fieldConfig, candidateData, expectedData) {
7074
- const { path: path17, match, required = true, weight = 1 } = fieldConfig;
7075
- const candidateValue = resolvePath(candidateData, path17);
7076
- const expectedValue = resolvePath(expectedData, path17);
7077
- if (expectedValue === void 0) {
7078
- return {
7079
- path: path17,
7080
- score: 1,
7081
- // No expected value means no comparison needed
7082
- weight,
7083
- hit: true,
7084
- message: `${path17}: no expected value`
7085
- };
7086
- }
7087
- if (candidateValue === void 0) {
7088
- if (required) {
7089
- return {
7090
- path: path17,
7091
- score: 0,
7092
- weight,
7093
- hit: false,
7094
- message: `${path17} (required, missing)`
7095
- };
7096
- }
7097
- return {
7098
- path: path17,
7099
- score: 1,
7100
- // Don't penalize missing optional fields
7101
- weight: 0,
7102
- // Zero weight means it won't affect the score
7103
- hit: true,
7104
- message: `${path17}: optional field missing`
7105
- };
7106
- }
7107
- switch (match) {
7108
- case "exact":
7109
- return this.compareExact(path17, candidateValue, expectedValue, weight);
7110
- case "numeric_tolerance":
7111
- return this.compareNumericTolerance(
7112
- path17,
7113
- candidateValue,
7114
- expectedValue,
7115
- fieldConfig,
7116
- weight
7117
- );
7118
- case "date":
7119
- return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
7120
- default:
7121
- return {
7122
- path: path17,
7123
- score: 0,
7124
- weight,
7125
- hit: false,
7126
- message: `${path17}: unknown match type "${match}"`
7127
- };
7128
- }
7129
- }
7130
- /**
7131
- * Exact equality comparison.
7132
- */
7133
- compareExact(path17, candidateValue, expectedValue, weight) {
7134
- if (deepEqual(candidateValue, expectedValue)) {
7135
- return {
7136
- path: path17,
7137
- score: 1,
7138
- weight,
7139
- hit: true,
7140
- message: path17
7141
- };
7142
- }
7143
- if (typeof candidateValue !== typeof expectedValue) {
7144
- return {
7145
- path: path17,
7146
- score: 0,
7147
- weight,
7148
- hit: false,
7149
- message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
7150
- };
7151
- }
7152
- return {
7153
- path: path17,
7154
- score: 0,
7155
- weight,
7156
- hit: false,
7157
- message: `${path17} (value mismatch)`
7158
- };
7159
- }
7160
- /**
7161
- * Numeric comparison with absolute or relative tolerance.
7162
- */
7163
- compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
7164
- const { tolerance = 0, relative = false } = fieldConfig;
7165
- const candidateNum = toNumber(candidateValue);
7166
- const expectedNum = toNumber(expectedValue);
7167
- if (candidateNum === null || expectedNum === null) {
7168
- return {
7169
- path: path17,
7170
- score: 0,
7171
- weight,
7172
- hit: false,
7173
- message: `${path17} (non-numeric value)`
7174
- };
7175
- }
7176
- if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
7177
- return {
7178
- path: path17,
7179
- score: 0,
7180
- weight,
7181
- hit: false,
7182
- message: `${path17} (invalid numeric value)`
7183
- };
7184
- }
7185
- const diff = Math.abs(candidateNum - expectedNum);
7186
- let withinTolerance;
7187
- if (relative) {
7188
- const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
7189
- withinTolerance = relativeDiff <= tolerance;
7190
- } else {
7191
- withinTolerance = diff <= tolerance;
7192
- }
7193
- if (withinTolerance) {
7194
- return {
7195
- path: path17,
7196
- score: 1,
7197
- weight,
7198
- hit: true,
7199
- message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
7200
- };
7201
- }
7202
- return {
7203
- path: path17,
7204
- score: 0,
7205
- weight,
7206
- hit: false,
7207
- message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7208
- };
7209
- }
7210
- /**
7211
- * Date comparison with format normalization.
7212
- */
7213
- compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
7214
- const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
7215
- const candidateDate = parseDate(String(candidateValue), formats);
7216
- const expectedDate = parseDate(String(expectedValue), formats);
7217
- if (candidateDate === null) {
7218
- return {
7219
- path: path17,
7220
- score: 0,
7221
- weight,
7222
- hit: false,
7223
- message: `${path17} (unparseable candidate date)`
7224
- };
7225
- }
7226
- if (expectedDate === null) {
7227
- return {
7228
- path: path17,
7229
- score: 0,
7230
- weight,
7231
- hit: false,
7232
- message: `${path17} (unparseable expected date)`
7233
- };
7234
- }
7235
- if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
7236
- return {
7237
- path: path17,
7238
- score: 1,
7239
- weight,
7240
- hit: true,
7241
- message: path17
7242
- };
7243
- }
7244
- return {
7245
- path: path17,
7246
- score: 0,
7247
- weight,
7248
- hit: false,
7249
- message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7250
- };
7251
- }
7252
- /**
7253
- * Aggregate field results using configured strategy.
7254
- */
7255
- aggregateResults(results) {
7256
- const aggregation = this.config.aggregation ?? "weighted_average";
7257
- const hits = [];
7258
- const misses = [];
7259
- for (const result of results) {
7260
- if (result.hit) {
7261
- hits.push(result.message);
7262
- } else {
7263
- misses.push(result.message);
7264
- }
7265
- }
7266
- let score;
7267
- if (aggregation === "all_or_nothing") {
7268
- score = misses.length === 0 ? 1 : 0;
7269
- } else {
7270
- const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
7271
- if (totalWeight === 0) {
7272
- score = results.length === 0 ? 1 : 0;
7273
- } else {
7274
- const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
7275
- score = weightedSum / totalWeight;
7276
- }
7277
- }
7278
- const reasoning = `${hits.length}/${results.length} fields matched`;
7279
- return {
7280
- score: clampScore(score),
7281
- verdict: scoreToVerdict(score),
7282
- hits: hits.slice(0, 4),
7283
- misses: misses.slice(0, 4),
7284
- expectedAspectCount: results.length,
7285
- reasoning
7286
- };
7287
- }
7288
- };
7289
- function resolvePath(obj, path17) {
7290
- if (!path17 || !obj) {
7291
- return void 0;
7292
- }
7293
- const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
7294
- let current = obj;
7295
- for (const part of parts) {
7296
- if (current === null || current === void 0) {
7297
- return void 0;
7298
- }
7299
- if (typeof current !== "object") {
7300
- return void 0;
7301
- }
7302
- const isIndex = /^\d+$/.test(part);
7303
- if (isIndex && Array.isArray(current)) {
7304
- current = current[Number.parseInt(part, 10)];
7305
- } else {
7306
- current = current[part];
7307
- }
7308
- }
7309
- return current;
7310
- }
7311
- function toNumber(value) {
7312
- if (typeof value === "number") {
7313
- return value;
7314
- }
7315
- if (typeof value === "string") {
7316
- const num = Number.parseFloat(value);
7317
- return Number.isNaN(num) ? null : num;
7318
- }
7319
- return null;
7320
- }
7321
- function parseDate(dateStr, formats) {
7322
- if (!dateStr) return null;
7323
- const trimmed = dateStr.trim();
7324
- const isoDate = new Date(trimmed);
7325
- if (!Number.isNaN(isoDate.getTime())) {
7326
- return isoDate;
7327
- }
7328
- const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
7329
- if (localizedMatch) {
7330
- const day = Number.parseInt(localizedMatch[1], 10);
7331
- const monthName = localizedMatch[2].toLowerCase();
7332
- const year = Number.parseInt(localizedMatch[3], 10);
7333
- const month = MONTH_NAMES[monthName];
7334
- if (month !== void 0) {
7335
- return new Date(year, month, day);
7336
- }
7337
- }
7338
- const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
7339
- if (usMatch) {
7340
- const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
7341
- const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
7342
- if (hasUSFormat && !hasEUFormat) {
7343
- const month = Number.parseInt(usMatch[1], 10) - 1;
7344
- const day = Number.parseInt(usMatch[2], 10);
7345
- const year = Number.parseInt(usMatch[3], 10);
7346
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7347
- return new Date(year, month, day);
7348
- }
7349
- } else if (hasEUFormat && !hasUSFormat) {
7350
- const day = Number.parseInt(usMatch[1], 10);
7351
- const month = Number.parseInt(usMatch[2], 10) - 1;
7352
- const year = Number.parseInt(usMatch[3], 10);
7353
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
7354
- return new Date(year, month, day);
7355
- }
7356
- } else {
7357
- const num1 = Number.parseInt(usMatch[1], 10);
7358
- const num2 = Number.parseInt(usMatch[2], 10);
7359
- const year = Number.parseInt(usMatch[3], 10);
7360
- if (num1 > 12 && num2 <= 12) {
7361
- return new Date(year, num2 - 1, num1);
7362
- }
7363
- if (num2 > 12 && num1 <= 12) {
7364
- return new Date(year, num1 - 1, num2);
7365
- }
7366
- if (num1 <= 12 && num2 <= 31) {
7367
- return new Date(year, num1 - 1, num2);
7368
- }
7369
- }
7370
- }
7371
- return null;
7372
- }
7373
- function formatDateISO(date) {
7374
- return date.toISOString().split("T")[0];
7375
- }
7376
- function parseJsonFromTextSafe(text) {
7377
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
7378
- const match = cleaned.match(/\{[\s\S]*\}/);
7379
- const blob = match?.[0] ?? cleaned;
7380
- return JSON.parse(blob);
7381
- }
7382
- var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
7383
- {{EVALUATOR_RESULTS_JSON}}
7384
-
7385
- Decide the final score and verdict based on all evaluator results.
7386
- Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
7387
- var CompositeEvaluator = class {
7388
- kind = "composite";
7389
- config;
7390
- evaluatorFactory;
7391
- cwd;
7392
- constructor(options) {
7393
- this.config = options.config;
7394
- this.evaluatorFactory = options.evaluatorFactory;
7395
- this.cwd = options.cwd;
7581
+ try {
7582
+ return parseJsonFromTextSafe(message.content);
7583
+ } catch {
7584
+ }
7585
+ }
7586
+ }
7587
+ }
7588
+ return void 0;
7396
7589
  }
7397
- async evaluate(context) {
7398
- const memberResults = await Promise.all(
7399
- this.config.evaluators.map(async (memberConfig) => {
7400
- const evaluator = this.evaluatorFactory.create(memberConfig, context);
7590
+ /**
7591
+ * Evaluate a single field against the expected value.
7592
+ */
7593
+ evaluateField(fieldConfig, candidateData, expectedData) {
7594
+ const { path: path17, match, required = true, weight = 1 } = fieldConfig;
7595
+ const candidateValue = resolvePath(candidateData, path17);
7596
+ const expectedValue = resolvePath(expectedData, path17);
7597
+ if (expectedValue === void 0) {
7598
+ return {
7599
+ path: path17,
7600
+ score: 1,
7601
+ // No expected value means no comparison needed
7602
+ weight,
7603
+ hit: true,
7604
+ message: `${path17}: no expected value`
7605
+ };
7606
+ }
7607
+ if (candidateValue === void 0) {
7608
+ if (required) {
7401
7609
  return {
7402
- id: memberConfig.name,
7403
- type: memberConfig.type,
7404
- result: await evaluator.evaluate(context)
7610
+ path: path17,
7611
+ score: 0,
7612
+ weight,
7613
+ hit: false,
7614
+ message: `${path17} (required, missing)`
7405
7615
  };
7406
- })
7407
- );
7408
- return this.aggregate(memberResults, context);
7409
- }
7410
- async aggregate(results, context) {
7411
- const aggregator = this.config.aggregator;
7412
- switch (aggregator.type) {
7413
- case "code_judge":
7414
- return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
7415
- case "llm_judge":
7416
- return this.runLlmAggregator(results, context, aggregator);
7616
+ }
7617
+ return {
7618
+ path: path17,
7619
+ score: 1,
7620
+ // Don't penalize missing optional fields
7621
+ weight: 0,
7622
+ // Zero weight means it won't affect the score
7623
+ hit: true,
7624
+ message: `${path17}: optional field missing`
7625
+ };
7626
+ }
7627
+ switch (match) {
7628
+ case "exact":
7629
+ return this.compareExact(path17, candidateValue, expectedValue, weight);
7630
+ case "numeric_tolerance":
7631
+ return this.compareNumericTolerance(
7632
+ path17,
7633
+ candidateValue,
7634
+ expectedValue,
7635
+ fieldConfig,
7636
+ weight
7637
+ );
7638
+ case "date":
7639
+ return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
7417
7640
  default:
7418
- return this.runWeightedAverage(results, aggregator.weights);
7641
+ return {
7642
+ path: path17,
7643
+ score: 0,
7644
+ weight,
7645
+ hit: false,
7646
+ message: `${path17}: unknown match type "${match}"`
7647
+ };
7419
7648
  }
7420
7649
  }
7421
- runWeightedAverage(results, weights) {
7422
- let totalWeight = 0;
7423
- let weightedSum = 0;
7424
- const allHits = [];
7425
- const allMisses = [];
7426
- const reasoningParts = [];
7427
- const evaluatorResults = [];
7428
- for (const member of results) {
7429
- const weight = weights?.[member.id] ?? 1;
7430
- totalWeight += weight;
7431
- weightedSum += member.result.score * weight;
7432
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
7433
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
7434
- if (member.result.reasoning) {
7435
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
7436
- }
7437
- evaluatorResults.push({
7438
- name: member.id,
7439
- type: member.type,
7440
- score: member.result.score,
7650
+ /**
7651
+ * Exact equality comparison.
7652
+ */
7653
+ compareExact(path17, candidateValue, expectedValue, weight) {
7654
+ if (deepEqual(candidateValue, expectedValue)) {
7655
+ return {
7656
+ path: path17,
7657
+ score: 1,
7441
7658
  weight,
7442
- verdict: member.result.verdict,
7443
- hits: [...member.result.hits],
7444
- misses: [...member.result.misses],
7445
- reasoning: member.result.reasoning,
7446
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7447
- evaluatorResults: member.result.evaluatorResults
7448
- });
7659
+ hit: true,
7660
+ message: path17
7661
+ };
7662
+ }
7663
+ if (typeof candidateValue !== typeof expectedValue) {
7664
+ return {
7665
+ path: path17,
7666
+ score: 0,
7667
+ weight,
7668
+ hit: false,
7669
+ message: `${path17} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
7670
+ };
7449
7671
  }
7450
- const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
7451
7672
  return {
7452
- score: clampScore(finalScore),
7453
- verdict: scoreToVerdict(finalScore),
7454
- hits: allHits,
7455
- misses: allMisses,
7456
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
7457
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
7458
- evaluatorRawRequest: {
7459
- aggregator: "weighted_average",
7460
- ...weights ? { weights } : {}
7461
- },
7462
- evaluatorResults
7673
+ path: path17,
7674
+ score: 0,
7675
+ weight,
7676
+ hit: false,
7677
+ message: `${path17} (value mismatch)`
7463
7678
  };
7464
7679
  }
7465
- async runCodeAggregator(results, scriptPath, cwd, weights) {
7466
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7467
- const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
7468
- const evaluatorResults = results.map((member) => ({
7469
- name: member.id,
7470
- type: member.type,
7471
- score: member.result.score,
7472
- weight: weights?.[member.id] ?? 1,
7473
- verdict: member.result.verdict,
7474
- hits: [...member.result.hits],
7475
- misses: [...member.result.misses],
7476
- reasoning: member.result.reasoning,
7477
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7478
- evaluatorResults: member.result.evaluatorResults
7479
- }));
7480
- try {
7481
- const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
7482
- const parsed = parseJsonSafe(stdout);
7483
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
7484
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
7485
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
7486
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
7487
- const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
7680
+ /**
7681
+ * Numeric comparison with absolute or relative tolerance.
7682
+ */
7683
+ compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
7684
+ const { tolerance = 0, relative = false } = fieldConfig;
7685
+ const candidateNum = toNumber(candidateValue);
7686
+ const expectedNum = toNumber(expectedValue);
7687
+ if (candidateNum === null || expectedNum === null) {
7488
7688
  return {
7489
- score,
7490
- verdict,
7491
- hits,
7492
- misses,
7493
- expectedAspectCount: hits.length + misses.length || 1,
7494
- reasoning,
7495
- evaluatorRawRequest: {
7496
- aggregator: "code_judge",
7497
- script: scriptPath
7498
- },
7499
- evaluatorResults
7689
+ path: path17,
7690
+ score: 0,
7691
+ weight,
7692
+ hit: false,
7693
+ message: `${path17} (non-numeric value)`
7500
7694
  };
7501
- } catch (error) {
7502
- const message = error instanceof Error ? error.message : String(error);
7695
+ }
7696
+ if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
7503
7697
  return {
7698
+ path: path17,
7504
7699
  score: 0,
7505
- verdict: "fail",
7506
- hits: [],
7507
- misses: [`Code aggregator failed: ${message}`],
7508
- expectedAspectCount: 1,
7509
- reasoning: message,
7510
- evaluatorRawRequest: {
7511
- aggregator: "code_judge",
7512
- script: scriptPath,
7513
- error: message
7514
- },
7515
- evaluatorResults
7700
+ weight,
7701
+ hit: false,
7702
+ message: `${path17} (invalid numeric value)`
7516
7703
  };
7517
7704
  }
7518
- }
7519
- async runLlmAggregator(results, context, config) {
7520
- const judgeProvider = context.judgeProvider;
7521
- if (!judgeProvider) {
7522
- throw new Error("No judge provider available for LLM aggregation");
7705
+ const diff = Math.abs(candidateNum - expectedNum);
7706
+ let withinTolerance;
7707
+ if (relative) {
7708
+ const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
7709
+ withinTolerance = relativeDiff <= tolerance;
7710
+ } else {
7711
+ withinTolerance = diff <= tolerance;
7523
7712
  }
7524
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
7525
- const resultsJson = JSON.stringify(resultsObject, null, 2);
7526
- const evaluatorResults = results.map((member) => ({
7527
- name: member.id,
7528
- type: member.type,
7529
- score: member.result.score,
7530
- verdict: member.result.verdict,
7531
- hits: [...member.result.hits],
7532
- misses: [...member.result.misses],
7533
- reasoning: member.result.reasoning,
7534
- evaluatorRawRequest: member.result.evaluatorRawRequest,
7535
- evaluatorResults: member.result.evaluatorResults
7536
- }));
7537
- const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
7538
- const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
7539
- const systemPrompt = buildOutputSchema();
7540
- const evaluatorRawRequest = {
7541
- aggregator: "llm_judge",
7542
- userPrompt,
7543
- systemPrompt,
7544
- target: judgeProvider.targetName
7713
+ if (withinTolerance) {
7714
+ return {
7715
+ path: path17,
7716
+ score: 1,
7717
+ weight,
7718
+ hit: true,
7719
+ message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
7720
+ };
7721
+ }
7722
+ return {
7723
+ path: path17,
7724
+ score: 0,
7725
+ weight,
7726
+ hit: false,
7727
+ message: `${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
7545
7728
  };
7546
- try {
7547
- const model = judgeProvider.asLanguageModel?.();
7548
- if (model) {
7549
- const { text } = await (0, import_ai2.generateText)({
7550
- model,
7551
- system: systemPrompt,
7552
- prompt: userPrompt
7553
- });
7554
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
7555
- const score2 = clampScore(data2.score);
7556
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
7557
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
7558
- const reasoning2 = data2.reasoning;
7559
- return {
7560
- score: score2,
7561
- verdict: scoreToVerdict(score2),
7562
- hits: hits2,
7563
- misses: misses2,
7564
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
7565
- reasoning: reasoning2,
7566
- evaluatorRawRequest,
7567
- evaluatorResults
7568
- };
7569
- }
7570
- const response = await judgeProvider.invoke({
7571
- question: userPrompt,
7572
- systemPrompt,
7573
- evalCaseId: context.evalCase.id,
7574
- attempt: context.attempt
7575
- });
7576
- const data = freeformEvaluationSchema.parse(
7577
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
7578
- );
7579
- const score = clampScore(data.score);
7580
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
7581
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
7582
- const reasoning = data.reasoning;
7729
+ }
7730
+ /**
7731
+ * Date comparison with format normalization.
7732
+ */
7733
+ compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
7734
+ const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
7735
+ const candidateDate = parseDate(String(candidateValue), formats);
7736
+ const expectedDate = parseDate(String(expectedValue), formats);
7737
+ if (candidateDate === null) {
7583
7738
  return {
7584
- score,
7585
- verdict: scoreToVerdict(score),
7586
- hits,
7587
- misses,
7588
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
7589
- reasoning,
7590
- evaluatorRawRequest,
7591
- evaluatorResults
7739
+ path: path17,
7740
+ score: 0,
7741
+ weight,
7742
+ hit: false,
7743
+ message: `${path17} (unparseable candidate date)`
7592
7744
  };
7593
- } catch {
7745
+ }
7746
+ if (expectedDate === null) {
7594
7747
  return {
7748
+ path: path17,
7595
7749
  score: 0,
7596
- verdict: "fail",
7597
- hits: [],
7598
- misses: [],
7599
- expectedAspectCount: 1,
7600
- evaluatorRawRequest,
7601
- evaluatorResults
7750
+ weight,
7751
+ hit: false,
7752
+ message: `${path17} (unparseable expected date)`
7753
+ };
7754
+ }
7755
+ if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
7756
+ return {
7757
+ path: path17,
7758
+ score: 1,
7759
+ weight,
7760
+ hit: true,
7761
+ message: path17
7602
7762
  };
7603
7763
  }
7764
+ return {
7765
+ path: path17,
7766
+ score: 0,
7767
+ weight,
7768
+ hit: false,
7769
+ message: `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
7770
+ };
7771
+ }
7772
+ /**
7773
+ * Aggregate field results using configured strategy.
7774
+ */
7775
+ aggregateResults(results) {
7776
+ const aggregation = this.config.aggregation ?? "weighted_average";
7777
+ const hits = [];
7778
+ const misses = [];
7779
+ for (const result of results) {
7780
+ if (result.hit) {
7781
+ hits.push(result.message);
7782
+ } else {
7783
+ misses.push(result.message);
7784
+ }
7785
+ }
7786
+ let score;
7787
+ if (aggregation === "all_or_nothing") {
7788
+ score = misses.length === 0 ? 1 : 0;
7789
+ } else {
7790
+ const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
7791
+ if (totalWeight === 0) {
7792
+ score = results.length === 0 ? 1 : 0;
7793
+ } else {
7794
+ const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
7795
+ score = weightedSum / totalWeight;
7796
+ }
7797
+ }
7798
+ const reasoning = `${hits.length}/${results.length} fields matched`;
7799
+ return {
7800
+ score: clampScore(score),
7801
+ verdict: scoreToVerdict(score),
7802
+ hits: hits.slice(0, 4),
7803
+ misses: misses.slice(0, 4),
7804
+ expectedAspectCount: results.length,
7805
+ reasoning
7806
+ };
7604
7807
  }
7605
7808
  };
7809
/**
 * Resolve a dotted / bracketed path (e.g. "invoice.items[0].total") against a
 * nested value.
 *
 * Only own properties are followed. Inherited members such as `constructor`
 * or `toString` are treated as missing, so a field that merely shares its name
 * with a prototype member is not reported as present.
 *
 * @param {*} obj - Root value to traverse; falsy roots resolve to undefined.
 * @param {string} path17 - Path using "." and "[n]" separators.
 * @returns {*} The value at the path, or undefined when any segment is missing
 *   or a non-object is reached before the path is exhausted.
 */
function resolvePath(obj, path17) {
  if (!path17 || !obj) {
    return void 0;
  }
  // "a.b[0].c" -> ["a", "b", "0", "c"]; empty fragments left by "]." are dropped.
  const parts = path17.split(/\.|\[|\]/).filter((p) => p.length > 0);
  let current = obj;
  for (const part of parts) {
    // Covers null, undefined and primitives: nothing left to descend into.
    if (current === null || typeof current !== "object") {
      return void 0;
    }
    if (Array.isArray(current) && /^\d+$/.test(part)) {
      // Numeric segments index arrays; parseInt keeps "01" equivalent to 1.
      current = current[Number.parseInt(part, 10)];
    } else if (Object.hasOwn(current, part)) {
      current = current[part];
    } else {
      return void 0;
    }
  }
  return current;
}
7831
/**
 * Coerce a value to a finite number.
 *
 * Numbers are returned as-is and strings are read with Number.parseFloat
 * (leading whitespace and trailing non-numeric text are tolerated, as before).
 * NaN and +/-Infinity are reported as null so callers never receive a value
 * that cannot take part in a numeric comparison.
 *
 * @param {*} value - Candidate numeric value.
 * @returns {number|null} A finite number, or null when the value is not numeric.
 */
function toNumber(value) {
  let parsed;
  if (typeof value === "number") {
    parsed = value;
  } else if (typeof value === "string") {
    parsed = Number.parseFloat(value);
  } else {
    return null;
  }
  return Number.isFinite(parsed) ? parsed : null;
}
7841
/**
 * Parse a date string into a Date positioned at local midnight of that day.
 *
 * Recognised shapes, tried in this order:
 *   1. Date-only ISO      "YYYY-MM-DD"
 *   2. Day-month-name     "D-Mon-YYYY" (month looked up in MONTH_NAMES)
 *   3. Numeric            "A/B/YYYY" or "A-B-YYYY", disambiguated by `formats`
 *   4. Anything else the native Date constructor accepts
 *
 * The explicit patterns run before the native constructor on purpose:
 *   - The native parser reads "03/04/2024" month-first, which would silently
 *     override a configured day-first format.
 *   - The native parser treats date-only ISO strings as UTC midnight, which can
 *     be the previous calendar day in local time. The other branches build
 *     local dates, so ISO input is built the same way.
 *
 * @param {string} dateStr - Text to parse.
 * @param {string[]} [formats=[]] - Accepted format hints; only the presence of
 *   "MM/DD" / "MM-DD" versus "DD/MM" / "DD-MM" is inspected.
 * @returns {Date|null} Parsed date, or null when the text is not a date.
 */
function parseDate(dateStr, formats = []) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();
  const isValidMonthDay = (month, day) => month >= 0 && month <= 11 && day >= 1 && day <= 31;

  const isoMatch = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoMatch) {
    const year = Number.parseInt(isoMatch[1], 10);
    const month = Number.parseInt(isoMatch[2], 10) - 1;
    const day = Number.parseInt(isoMatch[3], 10);
    if (isValidMonthDay(month, day)) {
      return new Date(year, month, day);
    }
  }

  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const day = Number.parseInt(localizedMatch[1], 10);
    const monthName = localizedMatch[2].toLowerCase();
    const year = Number.parseInt(localizedMatch[3], 10);
    const month = MONTH_NAMES[monthName];
    if (month !== void 0) {
      return new Date(year, month, day);
    }
  }

  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    if (hasUSFormat && !hasEUFormat) {
      // Month-first only.
      if (isValidMonthDay(first - 1, second)) {
        return new Date(year, first - 1, second);
      }
    } else if (hasEUFormat && !hasUSFormat) {
      // Day-first only.
      if (isValidMonthDay(second - 1, first)) {
        return new Date(year, second - 1, first);
      }
    } else {
      // Both or neither configured: let the numbers decide, and fall back to
      // month-first when the pair is genuinely ambiguous.
      if (first > 12 && second <= 12) {
        return new Date(year, second - 1, first);
      }
      if (second > 12 && first <= 12) {
        return new Date(year, first - 1, second);
      }
      if (first <= 12 && second <= 31) {
        return new Date(year, first - 1, second);
      }
    }
  }

  // Last resort: full ISO timestamps and other engine-supported spellings.
  const fallback = new Date(trimmed);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}
7893
/**
 * Format a Date as "YYYY-MM-DD" using its local calendar fields.
 *
 * Local getters are used instead of toISOString(), which renders in UTC and
 * would print a local-midnight date as the previous day in time zones east of
 * UTC.
 *
 * @param {Date} date - A valid Date instance.
 * @returns {string} The local calendar date, zero-padded.
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
7896
/**
 * Thin alias for parseJsonFromText: forwards the text and hands back whatever
 * the underlying parser produces.
 *
 * NOTE(review): despite the "Safe" suffix, nothing is caught here; anything
 * parseJsonFromText throws propagates to the caller unchanged.
 *
 * @param {string} text - Raw text expected to contain JSON.
 * @returns {*} The result of parseJsonFromText(text).
 */
function parseJsonFromTextSafe(text) {
  const parsed = parseJsonFromText(text);
  return parsed;
}
7899
+
7900
+ // src/evaluation/evaluators/latency.ts
7606
7901
  var LatencyEvaluator = class {
7607
7902
  kind = "latency";
7608
7903
  config;
@@ -7639,53 +7934,13 @@ var LatencyEvaluator = class {
7639
7934
  evaluatorRawRequest: {
7640
7935
  type: "latency",
7641
7936
  threshold,
7642
- durationMs
7643
- }
7644
- };
7645
- }
7646
- };
7647
- var CostEvaluator = class {
7648
- kind = "cost";
7649
- config;
7650
- constructor(options) {
7651
- this.config = options.config;
7652
- }
7653
- evaluate(context) {
7654
- const { budget } = this.config;
7655
- const costUsd = context.traceSummary?.costUsd;
7656
- if (costUsd === void 0) {
7657
- return {
7658
- score: 0,
7659
- verdict: "fail",
7660
- hits: [],
7661
- misses: ["No cost data available in trace"],
7662
- expectedAspectCount: 1,
7663
- reasoning: "Execution cost not reported by provider",
7664
- evaluatorRawRequest: {
7665
- type: "cost",
7666
- budget,
7667
- costUsd: null
7668
- }
7669
- };
7670
- }
7671
- const passed = costUsd <= budget;
7672
- const score = passed ? 1 : 0;
7673
- const formatCost = (n) => `$${n.toFixed(4)}`;
7674
- return {
7675
- score,
7676
- verdict: passed ? "pass" : "fail",
7677
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
7678
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
7679
- expectedAspectCount: 1,
7680
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
7681
- evaluatorRawRequest: {
7682
- type: "cost",
7683
- budget,
7684
- costUsd
7937
+ durationMs
7685
7938
  }
7686
7939
  };
7687
7940
  }
7688
7941
  };
7942
+
7943
+ // src/evaluation/evaluators/token-usage.ts
7689
7944
  var TokenUsageEvaluator = class {
7690
7945
  kind = "token_usage";
7691
7946
  config;
@@ -7769,8 +8024,228 @@ var TokenUsageEvaluator = class {
7769
8024
  }
7770
8025
  };
7771
8026
 
8027
+ // src/evaluation/evaluators/tool-trajectory.ts
8028
/**
 * Decide whether a recorded tool call's arguments satisfy an expectation.
 *
 * The match is partial: every key listed in `expected` must exist on `actual`
 * as an own property with a deeply equal value. Extra keys on `actual` are
 * ignored.
 *
 * @param {object|"any"|undefined} expected - Expected arguments. `undefined`
 *   or the literal "any" accepts every call.
 * @param {*} actual - Arguments recorded for the call.
 * @returns {boolean} True when the call satisfies the expectation.
 */
function argsMatch(expected, actual) {
  if (expected === void 0 || expected === "any") return true;
  // Missing arguments can never satisfy an explicit expectation. null is
  // rejected together with undefined because Object.hasOwn(null, key) throws.
  if (actual === void 0 || actual === null) return false;
  for (const key of Object.keys(expected)) {
    if (!Object.hasOwn(actual, key)) return false;
    if (!deepEqual(expected[key], actual[key])) return false;
  }
  return true;
}
8038
/**
 * Scores an agent run by the tools it called.
 *
 * Supported `config.mode` values:
 *   - "any_order": each tool in `config.minimums` must be called at least the
 *     listed number of times.
 *   - "in_order":  `config.expected` must appear as an ordered subsequence of
 *     the recorded calls.
 *   - "exact":     `config.expected` is compared position by position with the
 *     recorded calls.
 *
 * Every result has the shape
 * { score, verdict, hits, misses, expectedAspectCount }.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;

  constructor(options) {
    this.config = options.config;
  }

  /**
   * Entry point. Tool calls found in `context.outputMessages` take priority;
   * `context.traceSummary` is only used as the summary when no calls were found.
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    const failWith = (miss) => ({
      score: 0,
      verdict: "fail",
      hits: [],
      misses: [miss],
      expectedAspectCount: 1
    });

    if (toolCalls.length === 0 && !traceSummary) {
      return failWith("No trace available for evaluation");
    }
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    if (!summary) {
      return failWith("No trace available for evaluation");
    }

    const { mode } = this.config;
    if (mode === "any_order") {
      return this.evaluateAnyOrder(summary);
    }
    if (mode === "in_order") {
      return this.evaluateInOrder(toolCalls);
    }
    if (mode === "exact") {
      return this.evaluateExact(toolCalls);
    }
    return failWith(`Unknown mode: ${mode}`);
  }

  /**
   * Flatten the `toolCalls` of every message into a list of { name, args },
   * preserving message order and call order.
   */
  extractToolCallsFromMessages(messages) {
    const collected = [];
    if (!messages) {
      return collected;
    }
    for (const message of messages) {
      const calls = message.toolCalls;
      if (!calls) {
        continue;
      }
      for (const call of calls) {
        collected.push({ name: call.tool, args: call.input });
      }
    }
    return collected;
  }

  /**
   * Count calls per tool name and list the distinct names alphabetically.
   * errorCount is always reported as 0 here.
   */
  buildSummary(toolCalls) {
    const toolCallsByName = {};
    for (const { name } of toolCalls) {
      const seenSoFar = toolCallsByName[name] ?? 0;
      toolCallsByName[name] = seenSoFar + 1;
    }
    return {
      eventCount: toolCalls.length,
      toolNames: Object.keys(toolCallsByName).sort(),
      toolCallsByName,
      errorCount: 0
    };
  }

  /**
   * "any_order": score is the fraction of tools in `config.minimums` whose
   * call count reached the required minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }

    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      const actual = summary.toolCallsByName[toolName] ?? 0;
      const bucket = actual >= required ? hits : misses;
      bucket.push(`${toolName}: called ${actual} times (required \u2265${required})`);
    }

    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }

  /**
   * "in_order": walk the recorded calls once, looking for each expected tool
   * in turn. The first later call with the expected name settles that
   * expectation, as a hit when its args match and as a miss otherwise, and
   * the search for the next expectation resumes after it.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }

    const hits = [];
    const misses = [];
    let cursor = 0;
    for (const [i, expectedItem] of expected.entries()) {
      const expectedTool = expectedItem.tool;
      let settled = false;
      while (!settled && cursor < toolCalls.length) {
        const position = cursor;
        cursor += 1;
        const actualCall = toolCalls[position];
        if (actualCall.name !== expectedTool) {
          continue;
        }
        if (argsMatch(expectedItem.args, actualCall.args)) {
          hits.push(`Found ${expectedTool} at position ${position}`);
        } else {
          misses.push(
            `Expected ${expectedTool} at position ${i}: tool found at ${position} but args mismatch`
          );
        }
        settled = true;
      }
      if (!settled) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }

    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }

  /**
   * "exact": compare expected and recorded calls position by position. A
   * length difference adds a miss message; the score is still the number of
   * matching positions divided by the number of expected calls.
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }

    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }

    const comparable = Math.min(expected.length, toolCalls.length);
    for (const [i, expectedItem] of expected.entries()) {
      const expectedTool = expectedItem.tool;
      if (i >= comparable) {
        // The recorded trace ended before this expectation.
        misses.push(`Position ${i}: expected ${expectedTool}, got nothing`);
        continue;
      }
      const actualCall = toolCalls[i];
      const actualTool = actualCall.name;
      if (actualTool !== expectedTool) {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      } else if (argsMatch(expectedItem.args, actualCall.args)) {
        hits.push(`Position ${i}: ${expectedTool}`);
      } else {
        misses.push(`Position ${i}: ${expectedTool} args mismatch`);
      }
    }

    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
8246
+
7772
8247
  // src/evaluation/orchestrator.ts
7773
- var import_node_crypto4 = require("crypto");
8248
+ var import_node_crypto5 = require("crypto");
7774
8249
  var import_node_path16 = __toESM(require("path"), 1);
7775
8250
 
7776
8251
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -7982,6 +8457,17 @@ async function runEvaluation(options) {
7982
8457
  }
7983
8458
  return getOrCreateProvider(resolvedJudge);
7984
8459
  };
8460
+ const targetResolver = (name) => {
8461
+ const resolved = resolveTargetByName(name);
8462
+ if (!resolved) {
8463
+ return void 0;
8464
+ }
8465
+ return getOrCreateProvider(resolved);
8466
+ };
8467
+ const availableTargets = [
8468
+ target.name,
8469
+ ...Array.from(targetDefinitions.keys())
8470
+ ];
7985
8471
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
7986
8472
  const primaryProvider = getOrCreateProvider(target);
7987
8473
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -8011,7 +8497,9 @@ async function runEvaluation(options) {
8011
8497
  onResult,
8012
8498
  verbose,
8013
8499
  resolveJudgeProvider,
8014
- agentTimeoutMs
8500
+ agentTimeoutMs,
8501
+ targetResolver,
8502
+ availableTargets
8015
8503
  });
8016
8504
  } catch (error) {
8017
8505
  if (verbose) {
@@ -8050,7 +8538,9 @@ async function runEvaluation(options) {
8050
8538
  cache,
8051
8539
  useCache,
8052
8540
  now,
8053
- judgeProvider
8541
+ judgeProvider,
8542
+ targetResolver,
8543
+ availableTargets
8054
8544
  });
8055
8545
  if (onProgress) {
8056
8546
  await onProgress({
@@ -8117,7 +8607,9 @@ async function runBatchEvaluation(options) {
8117
8607
  onProgress,
8118
8608
  onResult,
8119
8609
  resolveJudgeProvider,
8120
- agentTimeoutMs
8610
+ agentTimeoutMs,
8611
+ targetResolver,
8612
+ availableTargets
8121
8613
  } = options;
8122
8614
  const promptInputsList = [];
8123
8615
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -8176,7 +8668,7 @@ async function runBatchEvaluation(options) {
8176
8668
  costUsd: providerResponse.costUsd,
8177
8669
  durationMs: providerResponse.durationMs
8178
8670
  }) : void 0;
8179
- const candidate = extractLastAssistantContent(outputMessages);
8671
+ const candidate = extractLastAssistantContent2(outputMessages);
8180
8672
  const providerError = extractProviderError(providerResponse);
8181
8673
  let result;
8182
8674
  try {
@@ -8192,7 +8684,9 @@ async function runBatchEvaluation(options) {
8192
8684
  judgeProvider: await resolveJudgeProvider(target),
8193
8685
  agentTimeoutMs,
8194
8686
  outputMessages,
8195
- traceSummary
8687
+ traceSummary,
8688
+ targetResolver,
8689
+ availableTargets
8196
8690
  });
8197
8691
  if (providerError) {
8198
8692
  result = { ...result, error: providerError };
@@ -8250,7 +8744,9 @@ async function runEvalCase(options) {
8250
8744
  cache,
8251
8745
  useCache,
8252
8746
  signal,
8253
- judgeProvider
8747
+ judgeProvider,
8748
+ targetResolver,
8749
+ availableTargets
8254
8750
  } = options;
8255
8751
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
8256
8752
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -8309,7 +8805,7 @@ async function runEvalCase(options) {
8309
8805
  costUsd: providerResponse.costUsd,
8310
8806
  durationMs: providerResponse.durationMs
8311
8807
  }) : void 0;
8312
- const candidate = extractLastAssistantContent(outputMessages);
8808
+ const candidate = extractLastAssistantContent2(outputMessages);
8313
8809
  const providerError = extractProviderError(providerResponse);
8314
8810
  try {
8315
8811
  const result = await evaluateCandidate({
@@ -8324,7 +8820,9 @@ async function runEvalCase(options) {
8324
8820
  judgeProvider,
8325
8821
  agentTimeoutMs,
8326
8822
  outputMessages,
8327
- traceSummary
8823
+ traceSummary,
8824
+ targetResolver,
8825
+ availableTargets
8328
8826
  });
8329
8827
  return providerError ? { ...result, error: providerError } : result;
8330
8828
  } catch (error) {
@@ -8344,7 +8842,9 @@ async function evaluateCandidate(options) {
8344
8842
  judgeProvider,
8345
8843
  agentTimeoutMs,
8346
8844
  outputMessages,
8347
- traceSummary
8845
+ traceSummary,
8846
+ targetResolver,
8847
+ availableTargets
8348
8848
  } = options;
8349
8849
  const gradeTimestamp = nowFn();
8350
8850
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -8359,7 +8859,9 @@ async function evaluateCandidate(options) {
8359
8859
  judgeProvider,
8360
8860
  agentTimeoutMs,
8361
8861
  outputMessages,
8362
- traceSummary
8862
+ traceSummary,
8863
+ targetResolver,
8864
+ availableTargets
8363
8865
  });
8364
8866
  const completedAt = nowFn();
8365
8867
  let agentProviderRequest;
@@ -8412,7 +8914,9 @@ async function runEvaluatorsForCase(options) {
8412
8914
  judgeProvider,
8413
8915
  agentTimeoutMs,
8414
8916
  outputMessages,
8415
- traceSummary
8917
+ traceSummary,
8918
+ targetResolver,
8919
+ availableTargets
8416
8920
  } = options;
8417
8921
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
8418
8922
  return runEvaluatorList({
@@ -8428,7 +8932,9 @@ async function runEvaluatorsForCase(options) {
8428
8932
  judgeProvider,
8429
8933
  agentTimeoutMs,
8430
8934
  outputMessages,
8431
- traceSummary
8935
+ traceSummary,
8936
+ targetResolver,
8937
+ availableTargets
8432
8938
  });
8433
8939
  }
8434
8940
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -8446,7 +8952,9 @@ async function runEvaluatorsForCase(options) {
8446
8952
  now,
8447
8953
  judgeProvider,
8448
8954
  outputMessages,
8449
- traceSummary
8955
+ traceSummary,
8956
+ targetResolver,
8957
+ availableTargets
8450
8958
  });
8451
8959
  return { score };
8452
8960
  }
@@ -8464,7 +8972,9 @@ async function runEvaluatorList(options) {
8464
8972
  judgeProvider,
8465
8973
  agentTimeoutMs,
8466
8974
  outputMessages,
8467
- traceSummary
8975
+ traceSummary,
8976
+ targetResolver,
8977
+ availableTargets
8468
8978
  } = options;
8469
8979
  const scored = [];
8470
8980
  const evaluatorResults = [];
@@ -8502,7 +9012,8 @@ async function runEvaluatorList(options) {
8502
9012
  script: evaluator.script,
8503
9013
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
8504
9014
  agentTimeoutMs,
8505
- config: evaluator.config
9015
+ config: evaluator.config,
9016
+ target: evaluator.target
8506
9017
  });
8507
9018
  const score2 = await codeEvaluator.evaluate({
8508
9019
  evalCase,
@@ -8512,8 +9023,11 @@ async function runEvaluatorList(options) {
8512
9023
  attempt,
8513
9024
  promptInputs,
8514
9025
  now,
9026
+ judgeProvider,
8515
9027
  outputMessages,
8516
- traceSummary
9028
+ traceSummary,
9029
+ targetResolver,
9030
+ availableTargets
8517
9031
  });
8518
9032
  const weight = evaluator.weight ?? 1;
8519
9033
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -8526,7 +9040,8 @@ async function runEvaluatorList(options) {
8526
9040
  hits: score2.hits,
8527
9041
  misses: score2.misses,
8528
9042
  reasoning: score2.reasoning,
8529
- evaluatorProviderRequest: score2.evaluatorRawRequest
9043
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
9044
+ details: score2.details
8530
9045
  });
8531
9046
  }
8532
9047
  if (evaluator.type === "composite") {
@@ -8540,7 +9055,8 @@ async function runEvaluatorList(options) {
8540
9055
  script: memberConfig.script,
8541
9056
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
8542
9057
  agentTimeoutMs,
8543
- config: memberConfig.config
9058
+ config: memberConfig.config,
9059
+ target: memberConfig.target
8544
9060
  });
8545
9061
  case "composite":
8546
9062
  return new CompositeEvaluator({
@@ -8589,7 +9105,9 @@ async function runEvaluatorList(options) {
8589
9105
  now,
8590
9106
  judgeProvider,
8591
9107
  outputMessages,
8592
- traceSummary
9108
+ traceSummary,
9109
+ targetResolver,
9110
+ availableTargets
8593
9111
  });
8594
9112
  const weight = evaluator.weight ?? 1;
8595
9113
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -8785,11 +9303,11 @@ async function runEvaluatorList(options) {
8785
9303
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
8786
9304
  0
8787
9305
  );
8788
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
9306
+ const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
8789
9307
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
8790
9308
  const score = {
8791
9309
  score: aggregateScore,
8792
- verdict: scoreToVerdict2(aggregateScore),
9310
+ verdict: scoreToVerdict(aggregateScore),
8793
9311
  hits,
8794
9312
  misses,
8795
9313
  expectedAspectCount,
@@ -8836,18 +9354,6 @@ async function resolveCustomPrompt(config) {
8836
9354
  }
8837
9355
  return config.prompt;
8838
9356
  }
8839
- function isNonEmptyString2(value) {
8840
- return typeof value === "string" && value.trim().length > 0;
8841
- }
8842
- function scoreToVerdict2(score) {
8843
- if (score >= 0.8) {
8844
- return "pass";
8845
- }
8846
- if (score >= 0.6) {
8847
- return "borderline";
8848
- }
8849
- return "fail";
8850
- }
8851
9357
  function filterEvalCases(evalCases, evalId) {
8852
9358
  if (!evalId) {
8853
9359
  return evalCases;
@@ -8949,7 +9455,7 @@ function extractProviderError(response) {
8949
9455
  return trimmed.length > 0 ? trimmed : void 0;
8950
9456
  }
8951
9457
  function createCacheKey(provider, target, evalCase, promptInputs) {
8952
- const hash = (0, import_node_crypto4.createHash)("sha256");
9458
+ const hash = (0, import_node_crypto5.createHash)("sha256");
8953
9459
  hash.update(provider.id);
8954
9460
  hash.update(target.name);
8955
9461
  hash.update(evalCase.id);
@@ -8990,7 +9496,8 @@ function mapChildResults(children) {
8990
9496
  misses: child.misses,
8991
9497
  reasoning: child.reasoning,
8992
9498
  evaluatorProviderRequest: child.evaluatorRawRequest,
8993
- evaluatorResults: mapChildResults(child.evaluatorResults)
9499
+ evaluatorResults: mapChildResults(child.evaluatorResults),
9500
+ details: child.details
8994
9501
  }));
8995
9502
  }
8996
9503
  function computeWeightedMean(entries) {
@@ -9005,7 +9512,7 @@ function computeWeightedMean(entries) {
9005
9512
  }
9006
9513
 
9007
9514
  // src/evaluation/generators/rubric-generator.ts
9008
- var import_ai3 = require("ai");
9515
+ var import_ai4 = require("ai");
9009
9516
  var import_zod4 = require("zod");
9010
9517
  var rubricItemSchema = import_zod4.z.object({
9011
9518
  id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -9039,7 +9546,7 @@ You must return a valid JSON object matching this schema:
9039
9546
  let lastError;
9040
9547
  for (let attempt = 1; attempt <= 3; attempt++) {
9041
9548
  try {
9042
- const { text } = await (0, import_ai3.generateText)({
9549
+ const { text } = await (0, import_ai4.generateText)({
9043
9550
  model,
9044
9551
  system,
9045
9552
  prompt
@@ -9084,17 +9591,6 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
9084
9591
  return parts.join("\n");
9085
9592
  }
9086
9593
 
9087
- // src/evaluation/code-judge-sdk.ts
9088
- var import_node_fs7 = require("fs");
9089
- function parseCodeJudgePayload(payload) {
9090
- const parsed = JSON.parse(payload);
9091
- return toCamelCaseDeep(parsed);
9092
- }
9093
- function readCodeJudgePayload() {
9094
- const stdin = (0, import_node_fs7.readFileSync)(0, "utf8");
9095
- return parseCodeJudgePayload(stdin);
9096
- }
9097
-
9098
9594
  // src/index.ts
9099
9595
  function createAgentKernel() {
9100
9596
  return { status: "stub" };
@@ -9113,33 +9609,39 @@ function createAgentKernel() {
9113
9609
  ToolTrajectoryEvaluator,
9114
9610
  avgToolDurationMs,
9115
9611
  buildDirectoryChain,
9612
+ buildOutputSchema,
9116
9613
  buildPromptInputs,
9117
9614
  buildSearchRoots,
9615
+ clampScore,
9118
9616
  computeTraceSummary,
9119
9617
  consumeClaudeCodeLogEntries,
9120
9618
  consumeCodexLogEntries,
9121
9619
  consumePiLogEntries,
9122
9620
  createAgentKernel,
9123
9621
  createProvider,
9622
+ deepEqual,
9124
9623
  ensureVSCodeSubagents,
9624
+ executeScript,
9125
9625
  explorationRatio,
9126
- extractCodeBlocks,
9626
+ extractJsonBlob,
9127
9627
  fileExists,
9128
9628
  findGitRoot,
9629
+ freeformEvaluationSchema,
9129
9630
  generateRubrics,
9130
9631
  getHitCount,
9131
9632
  isEvaluatorKind,
9132
9633
  isGuidelineFile,
9133
9634
  isJsonObject,
9134
9635
  isJsonValue,
9636
+ isNonEmptyString,
9135
9637
  isTestMessage,
9136
9638
  isTestMessageRole,
9137
9639
  listTargetNames,
9138
9640
  loadEvalCases,
9139
9641
  mergeExecutionMetrics,
9140
9642
  normalizeLineEndings,
9141
- parseCodeJudgePayload,
9142
- readCodeJudgePayload,
9643
+ parseJsonFromText,
9644
+ parseJsonSafe,
9143
9645
  readJsonFile,
9144
9646
  readTargetDefinitions,
9145
9647
  readTestSuiteMetadata,
@@ -9149,6 +9651,7 @@ function createAgentKernel() {
9149
9651
  resolveTargetDefinition,
9150
9652
  runEvalCase,
9151
9653
  runEvaluation,
9654
+ scoreToVerdict,
9152
9655
  subscribeToClaudeCodeLogEntries,
9153
9656
  subscribeToCodexLogEntries,
9154
9657
  subscribeToPiLogEntries,