@agentv/core 2.0.2 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
150
150
  import path6 from "node:path";
151
151
  import { parse as parse2 } from "yaml";
152
152
 
153
- // src/evaluation/formatting/segment-formatter.ts
154
- function extractCodeBlocks(segments) {
155
- const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
156
- const codeBlocks = [];
157
- for (const segment of segments) {
158
- const typeValue = segment.type;
159
- if (typeof typeValue !== "string" || typeValue !== "text") {
160
- continue;
161
- }
162
- const textValue = segment.value;
163
- if (typeof textValue !== "string") {
164
- continue;
165
- }
166
- const matches = textValue.match(CODE_BLOCK_PATTERN);
167
- if (matches) {
168
- codeBlocks.push(...matches);
169
- }
170
- }
171
- return codeBlocks;
172
- }
173
- function formatFileContents(parts) {
174
- const fileCount = parts.filter((p) => p.isFile).length;
175
- if (fileCount > 0) {
176
- return parts.map((part) => {
177
- if (part.isFile && part.displayPath) {
178
- return `<file path="${part.displayPath}">
179
- ${part.content}
180
- </file>`;
181
- }
182
- return part.content;
183
- }).join("\n\n");
184
- }
185
- return parts.map((p) => p.content).join(" ");
186
- }
187
- function formatSegment(segment, mode = "lm") {
188
- const type = asString(segment.type);
189
- if (type === "text") {
190
- return asString(segment.value);
191
- }
192
- if (type === "guideline_ref") {
193
- const refPath = asString(segment.path);
194
- return refPath ? `<Attached: ${refPath}>` : void 0;
195
- }
196
- if (type === "file") {
197
- const filePath = asString(segment.path);
198
- if (!filePath) {
199
- return void 0;
200
- }
201
- if (mode === "agent") {
202
- return `<file: path="${filePath}">`;
203
- }
204
- const text = asString(segment.text);
205
- if (text && filePath) {
206
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
207
- }
208
- }
209
- return void 0;
210
- }
211
- function hasVisibleContent(segments) {
212
- return segments.some((segment) => {
213
- const type = asString(segment.type);
214
- if (type === "text") {
215
- const value = asString(segment.value);
216
- return value !== void 0 && value.trim().length > 0;
217
- }
218
- if (type === "guideline_ref") {
219
- return false;
220
- }
221
- if (type === "file") {
222
- const text = asString(segment.text);
223
- return text !== void 0 && text.trim().length > 0;
224
- }
225
- return false;
226
- });
227
- }
228
- function asString(value) {
229
- return typeof value === "string" ? value : void 0;
230
- }
231
-
232
153
  // src/evaluation/loaders/config-loader.ts
233
154
  import { readFile } from "node:fs/promises";
234
155
  import path2 from "node:path";
@@ -336,7 +257,6 @@ async function resolveFileReference2(rawValue, searchRoots) {
336
257
  }
337
258
 
338
259
  // src/evaluation/loaders/config-loader.ts
339
- var SCHEMA_CONFIG_V2 = "agentv-config-v2";
340
260
  var ANSI_YELLOW = "\x1B[33m";
341
261
  var ANSI_RESET = "\x1B[0m";
342
262
  async function loadConfig(evalFilePath, repoRoot) {
@@ -354,13 +274,6 @@ async function loadConfig(evalFilePath, repoRoot) {
354
274
  continue;
355
275
  }
356
276
  const config = parsed;
357
- const schema = config.$schema;
358
- if (schema !== SCHEMA_CONFIG_V2) {
359
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
360
- Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
361
- logWarning(message);
362
- continue;
363
- }
364
277
  const guidelinePatterns = config.guideline_patterns;
365
278
  if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
366
279
  logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
@@ -469,7 +382,8 @@ var ANSI_YELLOW3 = "\x1B[33m";
469
382
  var ANSI_RESET3 = "\x1B[0m";
470
383
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
471
384
  const execution = rawEvalCase.execution;
472
- const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
385
+ const executionObject = isJsonObject2(execution) ? execution : void 0;
386
+ const candidateEvaluators = (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators ?? globalExecution?.evaluators;
473
387
  if (candidateEvaluators === void 0) {
474
388
  return void 0;
475
389
  }
@@ -483,7 +397,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
483
397
  logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
484
398
  continue;
485
399
  }
486
- const name = asString2(rawEvaluator.name);
400
+ const name = asString(rawEvaluator.name);
487
401
  const typeValue = rawEvaluator.type;
488
402
  if (!name || !isEvaluatorKind(typeValue)) {
489
403
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -511,7 +425,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
511
425
  continue;
512
426
  }
513
427
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
514
- const cwd = asString2(rawEvaluator.cwd);
428
+ const cwd = asString(rawEvaluator.cwd);
515
429
  let resolvedCwd;
516
430
  if (cwd) {
517
431
  const resolved = await resolveFileReference2(cwd, searchRoots);
@@ -526,7 +440,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
526
440
  } else {
527
441
  resolvedCwd = searchRoots[0];
528
442
  }
529
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
443
+ const rawTarget = rawEvaluator.target;
444
+ let targetConfig;
445
+ if (rawTarget !== void 0) {
446
+ if (isJsonObject2(rawTarget)) {
447
+ const maxCalls = rawTarget.max_calls;
448
+ if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
449
+ logWarning2(
450
+ `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
451
+ );
452
+ } else {
453
+ targetConfig = {
454
+ ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
455
+ };
456
+ }
457
+ } else if (rawTarget === true) {
458
+ targetConfig = {};
459
+ } else {
460
+ logWarning2(
461
+ `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
462
+ );
463
+ }
464
+ }
465
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
530
466
  const config = {};
531
467
  for (const [key, value] of Object.entries(rawEvaluator)) {
532
468
  if (!knownProps.has(key) && value !== void 0) {
@@ -540,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
540
476
  cwd,
541
477
  resolvedCwd,
542
478
  ...weight2 !== void 0 ? { weight: weight2 } : {},
543
- ...Object.keys(config).length > 0 ? { config } : {}
479
+ ...Object.keys(config).length > 0 ? { config } : {},
480
+ ...targetConfig !== void 0 ? { target: targetConfig } : {}
544
481
  });
545
482
  continue;
546
483
  }
@@ -557,7 +494,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
557
494
  logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
558
495
  continue;
559
496
  }
560
- const aggregatorType = asString2(rawAggregator.type);
497
+ const aggregatorType = asString(rawAggregator.type);
561
498
  if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
562
499
  logWarning2(
563
500
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -570,7 +507,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
570
507
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
571
508
  continue;
572
509
  }
573
- const memberName = asString2(rawMember.name);
510
+ const memberName = asString(rawMember.name);
574
511
  const memberType = rawMember.type;
575
512
  if (!memberName || !isEvaluatorKind(memberType)) {
576
513
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -608,7 +545,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
608
545
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
609
546
  };
610
547
  } else if (aggregatorType === "code_judge") {
611
- const aggregatorPath = asString2(rawAggregator.path);
548
+ const aggregatorPath = asString(rawAggregator.path);
612
549
  if (!aggregatorPath) {
613
550
  logWarning2(
614
551
  `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -621,7 +558,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
621
558
  cwd: searchRoots[0]
622
559
  };
623
560
  } else {
624
- const aggregatorPrompt = asString2(rawAggregator.prompt);
561
+ const aggregatorPrompt = asString(rawAggregator.prompt);
625
562
  let promptPath2;
626
563
  if (aggregatorPrompt) {
627
564
  const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
@@ -646,7 +583,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
646
583
  continue;
647
584
  }
648
585
  if (typeValue === "tool_trajectory") {
649
- const mode = asString2(rawEvaluator.mode);
586
+ const mode = asString(rawEvaluator.mode);
650
587
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
651
588
  logWarning2(
652
589
  `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -737,8 +674,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
737
674
  );
738
675
  continue;
739
676
  }
740
- const fieldPath = asString2(rawField.path);
741
- const match = asString2(rawField.match);
677
+ const fieldPath = asString(rawField.path);
678
+ const match = asString(rawField.match);
742
679
  if (!fieldPath) {
743
680
  logWarning2(
744
681
  `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -768,7 +705,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
768
705
  );
769
706
  continue;
770
707
  }
771
- const aggregation = asString2(rawEvaluator.aggregation);
708
+ const aggregation = asString(rawEvaluator.aggregation);
772
709
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
773
710
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
774
711
  evaluators.push({
@@ -849,7 +786,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
849
786
  });
850
787
  continue;
851
788
  }
852
- const prompt = asString2(rawEvaluator.prompt);
789
+ const prompt = asString(rawEvaluator.prompt);
853
790
  let promptPath;
854
791
  if (prompt) {
855
792
  const resolved = await resolveFileReference2(prompt, searchRoots);
@@ -868,11 +805,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
868
805
  );
869
806
  }
870
807
  }
871
- const _model = asString2(rawEvaluator.model);
808
+ const _model = asString(rawEvaluator.model);
872
809
  const rawRubrics = rawEvaluator.rubrics;
873
810
  const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
874
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
875
- description: asString2(rubric.description) ?? "",
811
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
812
+ description: asString(rubric.description) ?? "",
876
813
  weight: typeof rubric.weight === "number" ? rubric.weight : 1,
877
814
  required: typeof rubric.required === "boolean" ? rubric.required : true
878
815
  })).filter((r) => r.description.length > 0) : void 0;
@@ -916,7 +853,7 @@ function coerceEvaluator(candidate, contextId) {
916
853
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
917
854
  return void 0;
918
855
  }
919
- function asString2(value) {
856
+ function asString(value) {
920
857
  return typeof value === "string" ? value : void 0;
921
858
  }
922
859
  function asStringArray(value, description) {
@@ -992,6 +929,68 @@ function isValidFieldAggregationType(value) {
992
929
  // src/evaluation/loaders/message-processor.ts
993
930
  import { readFile as readFile3 } from "node:fs/promises";
994
931
  import path4 from "node:path";
932
+
933
+ // src/evaluation/formatting/segment-formatter.ts
934
+ function formatFileContents(parts) {
935
+ const fileCount = parts.filter((p) => p.isFile).length;
936
+ if (fileCount > 0) {
937
+ return parts.map((part) => {
938
+ if (part.isFile && part.displayPath) {
939
+ return `<file path="${part.displayPath}">
940
+ ${part.content}
941
+ </file>`;
942
+ }
943
+ return part.content;
944
+ }).join("\n\n");
945
+ }
946
+ return parts.map((p) => p.content).join(" ");
947
+ }
948
+ function formatSegment(segment, mode = "lm") {
949
+ const type = asString2(segment.type);
950
+ if (type === "text") {
951
+ return asString2(segment.value);
952
+ }
953
+ if (type === "guideline_ref") {
954
+ const refPath = asString2(segment.path);
955
+ return refPath ? `<Attached: ${refPath}>` : void 0;
956
+ }
957
+ if (type === "file") {
958
+ const filePath = asString2(segment.path);
959
+ if (!filePath) {
960
+ return void 0;
961
+ }
962
+ if (mode === "agent") {
963
+ return `<file: path="${filePath}">`;
964
+ }
965
+ const text = asString2(segment.text);
966
+ if (text && filePath) {
967
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
968
+ }
969
+ }
970
+ return void 0;
971
+ }
972
+ function hasVisibleContent(segments) {
973
+ return segments.some((segment) => {
974
+ const type = asString2(segment.type);
975
+ if (type === "text") {
976
+ const value = asString2(segment.value);
977
+ return value !== void 0 && value.trim().length > 0;
978
+ }
979
+ if (type === "guideline_ref") {
980
+ return false;
981
+ }
982
+ if (type === "file") {
983
+ const text = asString2(segment.text);
984
+ return text !== void 0 && text.trim().length > 0;
985
+ }
986
+ return false;
987
+ });
988
+ }
989
+ function asString2(value) {
990
+ return typeof value === "string" ? value : void 0;
991
+ }
992
+
993
+ // src/evaluation/loaders/message-processor.ts
995
994
  var ANSI_YELLOW4 = "\x1B[33m";
996
995
  var ANSI_RESET4 = "\x1B[0m";
997
996
  async function processMessages(options) {
@@ -1297,9 +1296,6 @@ ${messageContent}`);
1297
1296
  questionParts.push(formattedContent);
1298
1297
  }
1299
1298
  }
1300
- if (testCase.code_snippets.length > 0) {
1301
- questionParts.push(testCase.code_snippets.join("\n"));
1302
- }
1303
1299
  question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
1304
1300
  }
1305
1301
  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1498,7 +1494,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1498
1494
  repoRootPath,
1499
1495
  verbose
1500
1496
  }) : [];
1501
- const codeSnippets = extractCodeBlocks(inputSegments);
1502
1497
  let referenceAnswer = "";
1503
1498
  if (outputSegments.length > 0) {
1504
1499
  const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1571,7 +1566,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1571
1566
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
1572
1567
  guideline_patterns: guidelinePatterns,
1573
1568
  file_paths: allFilePaths,
1574
- code_snippets: codeSnippets,
1575
1569
  expected_outcome: outcome,
1576
1570
  evaluator: evalCaseEvaluatorKind,
1577
1571
  evaluators
@@ -5311,9 +5305,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
5311
5305
  return createProvider(resolved);
5312
5306
  }
5313
5307
 
5314
- // src/evaluation/evaluators.ts
5315
- import { generateText as generateText2 } from "ai";
5316
- import { z as z2 } from "zod";
5308
+ // src/evaluation/evaluators/scoring.ts
5309
+ function scoreToVerdict(score) {
5310
+ if (score >= 0.8) {
5311
+ return "pass";
5312
+ }
5313
+ if (score >= 0.6) {
5314
+ return "borderline";
5315
+ }
5316
+ return "fail";
5317
+ }
5318
+ function clampScore(value) {
5319
+ if (Number.isNaN(value) || !Number.isFinite(value)) {
5320
+ return 0;
5321
+ }
5322
+ if (value < 0) {
5323
+ return 0;
5324
+ }
5325
+ if (value > 1) {
5326
+ return 1;
5327
+ }
5328
+ return value;
5329
+ }
5330
+ function extractJsonBlob(text) {
5331
+ const match = text.match(/\{[\s\S]*\}/);
5332
+ return match?.[0];
5333
+ }
5334
+ function parseJsonFromText(text) {
5335
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
5336
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
5337
+ return JSON.parse(blob);
5338
+ }
5339
+ function isNonEmptyString(value) {
5340
+ return typeof value === "string" && value.trim().length > 0;
5341
+ }
5342
+ function parseJsonSafe(payload) {
5343
+ try {
5344
+ return JSON.parse(payload);
5345
+ } catch {
5346
+ return void 0;
5347
+ }
5348
+ }
5349
+ function deepEqual(a, b) {
5350
+ if (a === b) return true;
5351
+ if (a === null || b === null) return a === b;
5352
+ if (typeof a !== typeof b) return false;
5353
+ if (typeof a !== "object") return a === b;
5354
+ if (Array.isArray(a) !== Array.isArray(b)) return false;
5355
+ if (Array.isArray(a) && Array.isArray(b)) {
5356
+ if (a.length !== b.length) return false;
5357
+ return a.every((val, i) => deepEqual(val, b[i]));
5358
+ }
5359
+ const aObj = a;
5360
+ const bObj = b;
5361
+ const aKeys = Object.keys(aObj);
5362
+ const bKeys = Object.keys(bObj);
5363
+ if (aKeys.length !== bKeys.length) return false;
5364
+ return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
5365
+ }
5317
5366
 
5318
5367
  // src/runtime/exec.ts
5319
5368
  function shellEscapePath(value) {
@@ -5338,7 +5387,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
5338
5387
  cwd: options.cwd,
5339
5388
  stdin: encoder.encode(stdinPayload),
5340
5389
  stdout: "pipe",
5341
- stderr: "pipe"
5390
+ stderr: "pipe",
5391
+ // Merge additional env vars with process.env
5392
+ env: options.env ? { ...process.env, ...options.env } : process.env
5342
5393
  });
5343
5394
  let timedOut = false;
5344
5395
  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -5373,7 +5424,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
5373
5424
  const [cmd, ...args] = argv;
5374
5425
  const child = spawn4(cmd, args, {
5375
5426
  cwd: options.cwd,
5376
- stdio: ["pipe", "pipe", "pipe"]
5427
+ stdio: ["pipe", "pipe", "pipe"],
5428
+ // Merge additional env vars with process.env
5429
+ env: options.env ? { ...process.env, ...options.env } : process.env
5377
5430
  });
5378
5431
  const stdoutChunks = [];
5379
5432
  const stderrChunks = [];
@@ -5426,7 +5479,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
5426
5479
  const child = spawn4(wrappedCommand, {
5427
5480
  shell: true,
5428
5481
  cwd: options.cwd,
5429
- stdio: ["ignore", "ignore", "ignore"]
5482
+ stdio: ["ignore", "ignore", "ignore"],
5483
+ // Merge additional env vars with process.env
5484
+ env: options.env ? { ...process.env, ...options.env } : process.env
5430
5485
  });
5431
5486
  const timeout = options.timeoutMs ? setTimeout(() => {
5432
5487
  child.kill();
@@ -5453,32 +5508,387 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
5453
5508
  }
5454
5509
  }
5455
5510
 
5456
- // src/evaluation/case-conversion.ts
5457
- function toSnakeCase(str) {
5458
- if (/^[A-Z]/.test(str)) {
5459
- return str;
5511
+ // src/runtime/target-proxy.ts
5512
+ import { randomBytes } from "node:crypto";
5513
+ import { createServer } from "node:http";
5514
+ var DEFAULT_MAX_CALLS = 50;
5515
+ async function createTargetProxy(options) {
5516
+ const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
5517
+ const token = randomBytes(32).toString("hex");
5518
+ let callCount = 0;
5519
+ let isShutdown = false;
5520
+ const targetsList = availableTargets ?? [defaultProvider.targetName];
5521
+ function resolveProvider(targetName) {
5522
+ if (targetName === void 0 || targetName === defaultProvider.targetName) {
5523
+ return defaultProvider;
5524
+ }
5525
+ if (targetResolver) {
5526
+ return targetResolver(targetName);
5527
+ }
5528
+ return void 0;
5460
5529
  }
5461
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
5462
- }
5463
- function toSnakeCaseDeep(obj) {
5464
- if (obj === null || obj === void 0) {
5465
- return obj;
5530
+ const server = createServer(async (req, res) => {
5531
+ res.setHeader("Access-Control-Allow-Origin", "*");
5532
+ res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
5533
+ res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
5534
+ if (req.method === "OPTIONS") {
5535
+ res.writeHead(204);
5536
+ res.end();
5537
+ return;
5538
+ }
5539
+ const authHeader = req.headers.authorization;
5540
+ if (!authHeader || authHeader !== `Bearer ${token}`) {
5541
+ sendJson(res, 401, { error: "Unauthorized" });
5542
+ return;
5543
+ }
5544
+ if (isShutdown) {
5545
+ sendJson(res, 503, { error: "Proxy is shutting down" });
5546
+ return;
5547
+ }
5548
+ const url2 = req.url ?? "";
5549
+ if (req.method === "GET" && url2 === "/info") {
5550
+ handleInfo(res);
5551
+ return;
5552
+ }
5553
+ if (req.method === "POST" && url2 === "/invoke") {
5554
+ await handleInvoke(req, res);
5555
+ return;
5556
+ }
5557
+ if (req.method === "POST" && url2 === "/invokeBatch") {
5558
+ await handleInvokeBatch(req, res);
5559
+ return;
5560
+ }
5561
+ sendJson(res, 404, { error: "Not found" });
5562
+ });
5563
+ function handleInfo(res) {
5564
+ const response = {
5565
+ targetName: defaultProvider.targetName,
5566
+ maxCalls,
5567
+ callCount,
5568
+ availableTargets: targetsList
5569
+ };
5570
+ sendJson(res, 200, response);
5466
5571
  }
5467
- if (Array.isArray(obj)) {
5468
- return obj.map((item) => toSnakeCaseDeep(item));
5572
+ async function handleInvoke(req, res) {
5573
+ if (callCount >= maxCalls) {
5574
+ sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
5575
+ return;
5576
+ }
5577
+ try {
5578
+ const body = await readBody(req);
5579
+ const request = JSON.parse(body);
5580
+ if (!request.question || typeof request.question !== "string") {
5581
+ sendJson(res, 400, { error: "Missing required field: question" });
5582
+ return;
5583
+ }
5584
+ const provider = resolveProvider(request.target);
5585
+ if (!provider) {
5586
+ sendJson(res, 400, {
5587
+ error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
5588
+ });
5589
+ return;
5590
+ }
5591
+ callCount++;
5592
+ const response = await provider.invoke({
5593
+ question: request.question,
5594
+ systemPrompt: request.systemPrompt,
5595
+ evalCaseId: request.evalCaseId ?? "proxy",
5596
+ attempt: request.attempt ?? 1
5597
+ });
5598
+ const outputMessages = response.outputMessages ?? [];
5599
+ const rawText = extractLastAssistantContent2(outputMessages);
5600
+ const result = {
5601
+ outputMessages,
5602
+ rawText
5603
+ };
5604
+ sendJson(res, 200, result);
5605
+ } catch (error) {
5606
+ const message = error instanceof Error ? error.message : String(error);
5607
+ sendJson(res, 500, { error: message });
5608
+ }
5469
5609
  }
5470
- if (typeof obj === "object") {
5471
- const result = {};
5472
- for (const [key, value] of Object.entries(obj)) {
5473
- const snakeKey = toSnakeCase(key);
5474
- result[snakeKey] = toSnakeCaseDeep(value);
5610
+ async function handleInvokeBatch(req, res) {
5611
+ try {
5612
+ const body = await readBody(req);
5613
+ const { requests } = JSON.parse(body);
5614
+ if (!Array.isArray(requests)) {
5615
+ sendJson(res, 400, { error: "Missing required field: requests (array)" });
5616
+ return;
5617
+ }
5618
+ if (callCount + requests.length > maxCalls) {
5619
+ sendJson(res, 429, {
5620
+ error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
5621
+ });
5622
+ return;
5623
+ }
5624
+ const responses = [];
5625
+ for (const request of requests) {
5626
+ if (!request.question || typeof request.question !== "string") {
5627
+ responses.push({
5628
+ outputMessages: [],
5629
+ rawText: "Error: Missing required field: question"
5630
+ });
5631
+ continue;
5632
+ }
5633
+ const provider = resolveProvider(request.target);
5634
+ if (!provider) {
5635
+ responses.push({
5636
+ outputMessages: [],
5637
+ rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
5638
+ });
5639
+ continue;
5640
+ }
5641
+ callCount++;
5642
+ try {
5643
+ const response = await provider.invoke({
5644
+ question: request.question,
5645
+ systemPrompt: request.systemPrompt,
5646
+ evalCaseId: request.evalCaseId ?? "proxy",
5647
+ attempt: request.attempt ?? 1
5648
+ });
5649
+ const outputMessages = response.outputMessages ?? [];
5650
+ responses.push({
5651
+ outputMessages,
5652
+ rawText: extractLastAssistantContent2(outputMessages)
5653
+ });
5654
+ } catch (error) {
5655
+ const message = error instanceof Error ? error.message : String(error);
5656
+ responses.push({
5657
+ outputMessages: [],
5658
+ rawText: `Error: ${message}`
5659
+ });
5660
+ }
5661
+ }
5662
+ sendJson(res, 200, { responses });
5663
+ } catch (error) {
5664
+ const message = error instanceof Error ? error.message : String(error);
5665
+ sendJson(res, 500, { error: message });
5475
5666
  }
5476
- return result;
5477
5667
  }
5478
- return obj;
5668
+ await new Promise((resolve, reject) => {
5669
+ server.once("error", reject);
5670
+ server.listen(0, "127.0.0.1", () => {
5671
+ server.removeListener("error", reject);
5672
+ resolve();
5673
+ });
5674
+ });
5675
+ const address = server.address();
5676
+ const url = `http://127.0.0.1:${address.port}`;
5677
+ return {
5678
+ url,
5679
+ token,
5680
+ shutdown: async () => {
5681
+ isShutdown = true;
5682
+ return new Promise((resolve, reject) => {
5683
+ server.close((err) => {
5684
+ if (err) reject(err);
5685
+ else resolve();
5686
+ });
5687
+ });
5688
+ },
5689
+ getUsageMetadata: () => ({
5690
+ callCount,
5691
+ maxCalls
5692
+ })
5693
+ };
5694
+ }
5695
+ function sendJson(res, statusCode, body) {
5696
+ res.writeHead(statusCode, { "Content-Type": "application/json" });
5697
+ res.end(JSON.stringify(body));
5698
+ }
5699
+ function readBody(req) {
5700
+ return new Promise((resolve, reject) => {
5701
+ const chunks = [];
5702
+ req.on("data", (chunk) => chunks.push(chunk));
5703
+ req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
5704
+ req.on("error", reject);
5705
+ });
5706
+ }
5707
+ function extractLastAssistantContent2(messages) {
5708
+ for (let i = messages.length - 1; i >= 0; i--) {
5709
+ const msg = messages[i];
5710
+ if (msg.role === "assistant" && msg.content !== void 0) {
5711
+ if (typeof msg.content === "string") {
5712
+ return msg.content;
5713
+ }
5714
+ if (Array.isArray(msg.content)) {
5715
+ for (const part of msg.content) {
5716
+ if (typeof part === "object" && part !== null && "text" in part) {
5717
+ return String(part.text);
5718
+ }
5719
+ }
5720
+ }
5721
+ }
5722
+ }
5723
+ return void 0;
5724
+ }
5725
+
5726
+ // src/evaluation/case-conversion.ts
5727
+ function toSnakeCase(str) {
5728
+ if (/^[A-Z]/.test(str)) {
5729
+ return str;
5730
+ }
5731
+ return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
5732
+ }
5733
+ function toSnakeCaseDeep(obj) {
5734
+ if (obj === null || obj === void 0) {
5735
+ return obj;
5736
+ }
5737
+ if (Array.isArray(obj)) {
5738
+ return obj.map((item) => toSnakeCaseDeep(item));
5739
+ }
5740
+ if (typeof obj === "object") {
5741
+ const result = {};
5742
+ for (const [key, value] of Object.entries(obj)) {
5743
+ const snakeKey = toSnakeCase(key);
5744
+ result[snakeKey] = toSnakeCaseDeep(value);
5745
+ }
5746
+ return result;
5747
+ }
5748
+ return obj;
5749
+ }
5750
+
5751
+ // src/evaluation/evaluators/code-evaluator.ts
5752
+ var CodeEvaluator = class {
5753
+ kind = "code";
5754
+ script;
5755
+ cwd;
5756
+ agentTimeoutMs;
5757
+ config;
5758
+ target;
5759
+ constructor(options) {
5760
+ this.script = options.script;
5761
+ this.cwd = options.cwd;
5762
+ this.agentTimeoutMs = options.agentTimeoutMs;
5763
+ this.config = options.config;
5764
+ this.target = options.target;
5765
+ }
5766
+ async evaluate(context) {
5767
+ const payload = {
5768
+ question: context.evalCase.question,
5769
+ expectedOutcome: context.evalCase.expected_outcome,
5770
+ expectedMessages: context.evalCase.expected_messages,
5771
+ referenceAnswer: context.evalCase.reference_answer,
5772
+ candidateAnswer: context.candidate,
5773
+ outputMessages: context.outputMessages ?? null,
5774
+ guidelineFiles: context.evalCase.guideline_paths,
5775
+ inputFiles: context.evalCase.file_paths.filter(
5776
+ (path15) => !context.evalCase.guideline_paths.includes(path15)
5777
+ ),
5778
+ inputMessages: context.evalCase.input_messages,
5779
+ traceSummary: context.traceSummary ?? null,
5780
+ config: this.config ?? null
5781
+ };
5782
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5783
+ let proxyEnv;
5784
+ let proxyShutdown;
5785
+ let getProxyUsage;
5786
+ if (this.target !== void 0 && context.judgeProvider) {
5787
+ const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
5788
+ const proxy = await createTargetProxy({
5789
+ defaultProvider: context.judgeProvider,
5790
+ targetResolver: context.targetResolver,
5791
+ availableTargets: context.availableTargets,
5792
+ maxCalls
5793
+ });
5794
+ proxyEnv = {
5795
+ AGENTV_TARGET_PROXY_URL: proxy.url,
5796
+ AGENTV_TARGET_PROXY_TOKEN: proxy.token
5797
+ };
5798
+ proxyShutdown = proxy.shutdown;
5799
+ getProxyUsage = proxy.getUsageMetadata;
5800
+ }
5801
+ try {
5802
+ const stdout = await executeScript(
5803
+ this.script,
5804
+ inputPayload,
5805
+ this.agentTimeoutMs,
5806
+ this.cwd,
5807
+ proxyEnv
5808
+ );
5809
+ const parsed = parseJsonSafe(stdout);
5810
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5811
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
5812
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
5813
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
5814
+ const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
5815
+ const proxyUsage = getProxyUsage?.();
5816
+ const evaluatorRawRequest = {
5817
+ script: this.script,
5818
+ ...this.cwd ? { cwd: this.cwd } : {},
5819
+ ...proxyUsage ? {
5820
+ target_proxy: {
5821
+ call_count: proxyUsage.callCount,
5822
+ max_calls: proxyUsage.maxCalls
5823
+ }
5824
+ } : {}
5825
+ };
5826
+ return {
5827
+ score,
5828
+ verdict: scoreToVerdict(score),
5829
+ hits,
5830
+ misses,
5831
+ expectedAspectCount: hits.length + misses.length || 1,
5832
+ reasoning,
5833
+ evaluatorRawRequest,
5834
+ ...details ? { details } : {}
5835
+ };
5836
+ } catch (error) {
5837
+ const message = error instanceof Error ? error.message : String(error);
5838
+ const proxyUsage = getProxyUsage?.();
5839
+ return {
5840
+ score: 0,
5841
+ verdict: "fail",
5842
+ hits: [],
5843
+ misses: [`Code evaluator failed: ${message}`],
5844
+ expectedAspectCount: 1,
5845
+ reasoning: message,
5846
+ evaluatorRawRequest: {
5847
+ script: this.script,
5848
+ ...this.cwd ? { cwd: this.cwd } : {},
5849
+ ...proxyUsage ? {
5850
+ target_proxy: {
5851
+ call_count: proxyUsage.callCount,
5852
+ max_calls: proxyUsage.maxCalls
5853
+ }
5854
+ } : {},
5855
+ error: message
5856
+ }
5857
+ };
5858
+ } finally {
5859
+ if (proxyShutdown) {
5860
+ await proxyShutdown();
5861
+ }
5862
+ }
5863
+ }
5864
+ };
5865
+ async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
5866
+ const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
5867
+ if (exitCode !== 0) {
5868
+ const trimmedErr = formatStderr(stderr);
5869
+ throw new Error(
5870
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5871
+ );
5872
+ }
5873
+ return stdout.trim();
5874
+ }
5875
+ function formatStderr(stderr) {
5876
+ const trimmed = stderr.trim();
5877
+ const maxLength = 2e3;
5878
+ if (trimmed.length <= maxLength) {
5879
+ return trimmed;
5880
+ }
5881
+ const tail = trimmed.slice(-maxLength);
5882
+ return `...(truncated, last ${maxLength} chars)
5883
+ ${tail}`;
5479
5884
  }
5480
5885
 
5481
- // src/evaluation/evaluators.ts
5886
+ // src/evaluation/evaluators/composite.ts
5887
+ import { generateText as generateText3 } from "ai";
5888
+
5889
+ // src/evaluation/evaluators/llm-judge.ts
5890
+ import { generateText as generateText2 } from "ai";
5891
+ import { z as z2 } from "zod";
5482
5892
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
5483
5893
 
5484
5894
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -5558,7 +5968,7 @@ var LlmJudgeEvaluator = class {
5558
5968
  target: judgeProvider.targetName
5559
5969
  };
5560
5970
  try {
5561
- const { data, providerResponse } = await this.runWithRetry({
5971
+ const { data } = await this.runWithRetry({
5562
5972
  context,
5563
5973
  judgeProvider,
5564
5974
  systemPrompt,
@@ -5707,105 +6117,11 @@ You must return a valid JSON object matching this schema:
5707
6117
  "overall_reasoning": "string (summary)"
5708
6118
  }`;
5709
6119
  }
5710
- function scoreToVerdict(score) {
5711
- if (score >= 0.8) {
5712
- return "pass";
5713
- }
5714
- if (score >= 0.6) {
5715
- return "borderline";
5716
- }
5717
- return "fail";
5718
- }
5719
- function clampScore(value) {
5720
- if (Number.isNaN(value) || !Number.isFinite(value)) {
5721
- return 0;
5722
- }
5723
- if (value < 0) {
5724
- return 0;
5725
- }
5726
- if (value > 1) {
5727
- return 1;
5728
- }
5729
- return value;
5730
- }
5731
- function extractJsonBlob(text) {
5732
- const match = text.match(/\{[\s\S]*\}/);
5733
- return match?.[0];
5734
- }
5735
- function parseJsonFromText(text) {
5736
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
5737
- const blob = extractJsonBlob(cleaned) ?? cleaned;
5738
- return JSON.parse(blob);
5739
- }
5740
- function isNonEmptyString(value) {
5741
- return typeof value === "string" && value.trim().length > 0;
6120
+ function substituteVariables(template, variables) {
6121
+ return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
6122
+ return variables[varName] ?? match;
6123
+ });
5742
6124
  }
5743
- var CodeEvaluator = class {
5744
- kind = "code";
5745
- script;
5746
- cwd;
5747
- agentTimeoutMs;
5748
- config;
5749
- constructor(options) {
5750
- this.script = options.script;
5751
- this.cwd = options.cwd;
5752
- this.agentTimeoutMs = options.agentTimeoutMs;
5753
- this.config = options.config;
5754
- }
5755
- async evaluate(context) {
5756
- const payload = {
5757
- question: context.evalCase.question,
5758
- expectedOutcome: context.evalCase.expected_outcome,
5759
- expectedMessages: context.evalCase.expected_messages,
5760
- referenceAnswer: context.evalCase.reference_answer,
5761
- candidateAnswer: context.candidate,
5762
- outputMessages: context.outputMessages ?? null,
5763
- guidelineFiles: context.evalCase.guideline_paths,
5764
- inputFiles: context.evalCase.file_paths.filter(
5765
- (path15) => !context.evalCase.guideline_paths.includes(path15)
5766
- ),
5767
- inputMessages: context.evalCase.input_messages,
5768
- traceSummary: context.traceSummary ?? null,
5769
- config: this.config ?? null
5770
- };
5771
- const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5772
- try {
5773
- const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
5774
- const parsed = parseJsonSafe(stdout);
5775
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5776
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
5777
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
5778
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
5779
- return {
5780
- score,
5781
- verdict: scoreToVerdict(score),
5782
- hits,
5783
- misses,
5784
- expectedAspectCount: hits.length + misses.length || 1,
5785
- reasoning,
5786
- evaluatorRawRequest: {
5787
- script: this.script,
5788
- ...this.cwd ? { cwd: this.cwd } : {}
5789
- }
5790
- };
5791
- } catch (error) {
5792
- const message = error instanceof Error ? error.message : String(error);
5793
- return {
5794
- score: 0,
5795
- verdict: "fail",
5796
- hits: [],
5797
- misses: [`Code evaluator failed: ${message}`],
5798
- expectedAspectCount: 1,
5799
- reasoning: message,
5800
- evaluatorRawRequest: {
5801
- script: this.script,
5802
- ...this.cwd ? { cwd: this.cwd } : {},
5803
- error: message
5804
- }
5805
- };
5806
- }
5807
- }
5808
- };
5809
6125
  function calculateRubricScore(result, rubrics) {
5810
6126
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
5811
6127
  const hits = [];
@@ -5833,273 +6149,281 @@ function calculateRubricScore(result, rubrics) {
5833
6149
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
5834
6150
  return { score, verdict, hits, misses };
5835
6151
  }
5836
- async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
5837
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
5838
- if (exitCode !== 0) {
5839
- const trimmedErr = formatStderr(stderr);
5840
- throw new Error(
5841
- trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5842
- );
5843
- }
5844
- return stdout.trim();
5845
- }
5846
- function formatStderr(stderr) {
5847
- const trimmed = stderr.trim();
5848
- const maxLength = 2e3;
5849
- if (trimmed.length <= maxLength) {
5850
- return trimmed;
5851
- }
5852
- const tail = trimmed.slice(-maxLength);
5853
- return `...(truncated, last ${maxLength} chars)
5854
- ${tail}`;
5855
- }
5856
- function parseJsonSafe(payload) {
5857
- try {
5858
- return JSON.parse(payload);
5859
- } catch {
5860
- return void 0;
5861
- }
5862
- }
5863
- function substituteVariables(template, variables) {
5864
- return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
5865
- return variables[varName] ?? match;
5866
- });
5867
- }
5868
- function deepEqual(a, b) {
5869
- if (a === b) return true;
5870
- if (a === null || b === null) return a === b;
5871
- if (typeof a !== typeof b) return false;
5872
- if (typeof a !== "object") return a === b;
5873
- if (Array.isArray(a) !== Array.isArray(b)) return false;
5874
- if (Array.isArray(a) && Array.isArray(b)) {
5875
- if (a.length !== b.length) return false;
5876
- return a.every((val, i) => deepEqual(val, b[i]));
5877
- }
5878
- const aObj = a;
5879
- const bObj = b;
5880
- const aKeys = Object.keys(aObj);
5881
- const bKeys = Object.keys(bObj);
5882
- if (aKeys.length !== bKeys.length) return false;
5883
- return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
5884
- }
5885
- function argsMatch(expected, actual) {
5886
- if (expected === void 0) return true;
5887
- if (expected === "any") return true;
5888
- if (actual === void 0) return false;
5889
- for (const key of Object.keys(expected)) {
5890
- if (!Object.hasOwn(actual, key)) return false;
5891
- if (!deepEqual(expected[key], actual[key])) return false;
5892
- }
5893
- return true;
5894
- }
5895
- var ToolTrajectoryEvaluator = class {
5896
- kind = "tool_trajectory";
6152
+
6153
+ // src/evaluation/evaluators/composite.ts
6154
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
6155
+ {{EVALUATOR_RESULTS_JSON}}
6156
+
6157
+ Decide the final score and verdict based on all evaluator results.
6158
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
6159
+ var CompositeEvaluator = class {
6160
+ kind = "composite";
5897
6161
  config;
6162
+ evaluatorFactory;
6163
+ cwd;
5898
6164
  constructor(options) {
5899
6165
  this.config = options.config;
6166
+ this.evaluatorFactory = options.evaluatorFactory;
6167
+ this.cwd = options.cwd;
5900
6168
  }
5901
- evaluate(context) {
5902
- const { outputMessages, traceSummary } = context;
5903
- const toolCalls = this.extractToolCallsFromMessages(outputMessages);
5904
- if (toolCalls.length === 0 && !traceSummary) {
5905
- return {
5906
- score: 0,
5907
- verdict: "fail",
5908
- hits: [],
5909
- misses: ["No trace available for evaluation"],
5910
- expectedAspectCount: 1
5911
- };
5912
- }
5913
- const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
5914
- if (!summary) {
5915
- return {
5916
- score: 0,
5917
- verdict: "fail",
5918
- hits: [],
5919
- misses: ["No trace available for evaluation"],
5920
- expectedAspectCount: 1
5921
- };
5922
- }
5923
- switch (this.config.mode) {
5924
- case "any_order":
5925
- return this.evaluateAnyOrder(summary);
5926
- case "in_order":
5927
- return this.evaluateInOrder(toolCalls);
5928
- case "exact":
5929
- return this.evaluateExact(toolCalls);
5930
- default:
6169
+ async evaluate(context) {
6170
+ const memberResults = await Promise.all(
6171
+ this.config.evaluators.map(async (memberConfig) => {
6172
+ const evaluator = this.evaluatorFactory.create(memberConfig, context);
5931
6173
  return {
5932
- score: 0,
5933
- verdict: "fail",
5934
- hits: [],
5935
- misses: [`Unknown mode: ${this.config.mode}`],
5936
- expectedAspectCount: 1
6174
+ id: memberConfig.name,
6175
+ type: memberConfig.type,
6176
+ result: await evaluator.evaluate(context)
5937
6177
  };
5938
- }
6178
+ })
6179
+ );
6180
+ return this.aggregate(memberResults, context);
5939
6181
  }
5940
- /**
5941
- * Extract tool calls from output messages.
5942
- */
5943
- extractToolCallsFromMessages(messages) {
5944
- if (!messages) {
5945
- return [];
5946
- }
5947
- const toolCalls = [];
5948
- for (const message of messages) {
5949
- if (message.toolCalls) {
5950
- for (const call of message.toolCalls) {
5951
- toolCalls.push({
5952
- name: call.tool,
5953
- args: call.input
5954
- });
5955
- }
5956
- }
6182
+ async aggregate(results, context) {
6183
+ const aggregator = this.config.aggregator;
6184
+ switch (aggregator.type) {
6185
+ case "code_judge":
6186
+ return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
6187
+ case "llm_judge":
6188
+ return this.runLlmAggregator(results, context, aggregator);
6189
+ default:
6190
+ return this.runWeightedAverage(results, aggregator.weights);
5957
6191
  }
5958
- return toolCalls;
5959
6192
  }
5960
- /**
5961
- * Build a summary from extracted tool calls.
5962
- */
5963
- buildSummary(toolCalls) {
5964
- const toolCallsByName = {};
5965
- for (const call of toolCalls) {
5966
- toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
6193
+ runWeightedAverage(results, weights) {
6194
+ let totalWeight = 0;
6195
+ let weightedSum = 0;
6196
+ const allHits = [];
6197
+ const allMisses = [];
6198
+ const reasoningParts = [];
6199
+ const evaluatorResults = [];
6200
+ for (const member of results) {
6201
+ const weight = weights?.[member.id] ?? 1;
6202
+ totalWeight += weight;
6203
+ weightedSum += member.result.score * weight;
6204
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
6205
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
6206
+ if (member.result.reasoning) {
6207
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
6208
+ }
6209
+ evaluatorResults.push({
6210
+ name: member.id,
6211
+ type: member.type,
6212
+ score: member.result.score,
6213
+ weight,
6214
+ verdict: member.result.verdict,
6215
+ hits: [...member.result.hits],
6216
+ misses: [...member.result.misses],
6217
+ reasoning: member.result.reasoning,
6218
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6219
+ evaluatorResults: member.result.evaluatorResults,
6220
+ details: member.result.details
6221
+ });
5967
6222
  }
5968
- const toolNames = Object.keys(toolCallsByName).sort();
6223
+ const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
5969
6224
  return {
5970
- eventCount: toolCalls.length,
5971
- toolNames,
5972
- toolCallsByName,
5973
- errorCount: 0
6225
+ score: clampScore(finalScore),
6226
+ verdict: scoreToVerdict(finalScore),
6227
+ hits: allHits,
6228
+ misses: allMisses,
6229
+ expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
6230
+ reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
6231
+ evaluatorRawRequest: {
6232
+ aggregator: "weighted_average",
6233
+ ...weights ? { weights } : {}
6234
+ },
6235
+ evaluatorResults
5974
6236
  };
5975
6237
  }
5976
- evaluateAnyOrder(summary) {
5977
- const minimums = this.config.minimums ?? {};
5978
- const toolNames = Object.keys(minimums);
5979
- if (toolNames.length === 0) {
6238
+ async runCodeAggregator(results, scriptPath, cwd, weights) {
6239
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6240
+ const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
6241
+ const evaluatorResults = results.map((member) => ({
6242
+ name: member.id,
6243
+ type: member.type,
6244
+ score: member.result.score,
6245
+ weight: weights?.[member.id] ?? 1,
6246
+ verdict: member.result.verdict,
6247
+ hits: [...member.result.hits],
6248
+ misses: [...member.result.misses],
6249
+ reasoning: member.result.reasoning,
6250
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6251
+ evaluatorResults: member.result.evaluatorResults,
6252
+ details: member.result.details
6253
+ }));
6254
+ try {
6255
+ const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6256
+ const parsed = parseJsonSafe(stdout);
6257
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6258
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6259
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6260
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6261
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
5980
6262
  return {
5981
- score: 1,
5982
- verdict: "pass",
5983
- hits: ["No tool requirements specified"],
5984
- misses: [],
5985
- expectedAspectCount: 0
6263
+ score,
6264
+ verdict,
6265
+ hits,
6266
+ misses,
6267
+ expectedAspectCount: hits.length + misses.length || 1,
6268
+ reasoning,
6269
+ evaluatorRawRequest: {
6270
+ aggregator: "code_judge",
6271
+ script: scriptPath
6272
+ },
6273
+ evaluatorResults
6274
+ };
6275
+ } catch (error) {
6276
+ const message = error instanceof Error ? error.message : String(error);
6277
+ return {
6278
+ score: 0,
6279
+ verdict: "fail",
6280
+ hits: [],
6281
+ misses: [`Code aggregator failed: ${message}`],
6282
+ expectedAspectCount: 1,
6283
+ reasoning: message,
6284
+ evaluatorRawRequest: {
6285
+ aggregator: "code_judge",
6286
+ script: scriptPath,
6287
+ error: message
6288
+ },
6289
+ evaluatorResults
5986
6290
  };
5987
6291
  }
5988
- const hits = [];
5989
- const misses = [];
5990
- for (const toolName of toolNames) {
5991
- const required = minimums[toolName];
5992
- const actual = summary.toolCallsByName[toolName] ?? 0;
5993
- if (actual >= required) {
5994
- hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
5995
- } else {
5996
- misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
5997
- }
6292
+ }
6293
+ async runLlmAggregator(results, context, config) {
6294
+ const judgeProvider = context.judgeProvider;
6295
+ if (!judgeProvider) {
6296
+ throw new Error("No judge provider available for LLM aggregation");
5998
6297
  }
5999
- const score = hits.length / toolNames.length;
6000
- return {
6001
- score,
6002
- verdict: scoreToVerdict(score),
6003
- hits,
6004
- misses,
6005
- expectedAspectCount: toolNames.length
6298
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6299
+ const resultsJson = JSON.stringify(resultsObject, null, 2);
6300
+ const evaluatorResults = results.map((member) => ({
6301
+ name: member.id,
6302
+ type: member.type,
6303
+ score: member.result.score,
6304
+ verdict: member.result.verdict,
6305
+ hits: [...member.result.hits],
6306
+ misses: [...member.result.misses],
6307
+ reasoning: member.result.reasoning,
6308
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6309
+ evaluatorResults: member.result.evaluatorResults,
6310
+ details: member.result.details
6311
+ }));
6312
+ const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
6313
+ const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
6314
+ const systemPrompt = buildOutputSchema();
6315
+ const evaluatorRawRequest = {
6316
+ aggregator: "llm_judge",
6317
+ userPrompt,
6318
+ systemPrompt,
6319
+ target: judgeProvider.targetName
6006
6320
  };
6007
- }
6008
- evaluateInOrder(toolCalls) {
6009
- const expected = this.config.expected ?? [];
6010
- if (expected.length === 0) {
6321
+ try {
6322
+ const model = judgeProvider.asLanguageModel?.();
6323
+ if (model) {
6324
+ const { text } = await generateText3({
6325
+ model,
6326
+ system: systemPrompt,
6327
+ prompt: userPrompt
6328
+ });
6329
+ const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6330
+ const score2 = clampScore(data2.score);
6331
+ const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
6332
+ const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
6333
+ const reasoning2 = data2.reasoning;
6334
+ return {
6335
+ score: score2,
6336
+ verdict: scoreToVerdict(score2),
6337
+ hits: hits2,
6338
+ misses: misses2,
6339
+ expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
6340
+ reasoning: reasoning2,
6341
+ evaluatorRawRequest,
6342
+ evaluatorResults
6343
+ };
6344
+ }
6345
+ const response = await judgeProvider.invoke({
6346
+ question: userPrompt,
6347
+ systemPrompt,
6348
+ evalCaseId: context.evalCase.id,
6349
+ attempt: context.attempt
6350
+ });
6351
+ const data = freeformEvaluationSchema.parse(
6352
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
6353
+ );
6354
+ const score = clampScore(data.score);
6355
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
6356
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
6357
+ const reasoning = data.reasoning;
6011
6358
  return {
6012
- score: 1,
6013
- verdict: "pass",
6014
- hits: ["No tool sequence specified"],
6015
- misses: [],
6016
- expectedAspectCount: 0
6359
+ score,
6360
+ verdict: scoreToVerdict(score),
6361
+ hits,
6362
+ misses,
6363
+ expectedAspectCount: Math.max(hits.length + misses.length, 1),
6364
+ reasoning,
6365
+ evaluatorRawRequest,
6366
+ evaluatorResults
6017
6367
  };
6018
- }
6019
- const hits = [];
6020
- const misses = [];
6021
- let actualIndex = 0;
6022
- for (let i = 0; i < expected.length; i++) {
6023
- const expectedItem = expected[i];
6024
- const expectedTool = expectedItem.tool;
6025
- let found = false;
6026
- let argsMismatch = false;
6027
- while (actualIndex < toolCalls.length) {
6028
- const actualCall = toolCalls[actualIndex];
6029
- if (actualCall.name === expectedTool) {
6030
- if (argsMatch(expectedItem.args, actualCall.args)) {
6031
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
6032
- actualIndex++;
6033
- found = true;
6034
- break;
6035
- }
6036
- misses.push(
6037
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
6038
- );
6039
- actualIndex++;
6040
- argsMismatch = true;
6041
- break;
6042
- }
6043
- actualIndex++;
6044
- }
6045
- if (!found && !argsMismatch) {
6046
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
6047
- }
6048
- }
6049
- const score = hits.length / expected.length;
6050
- return {
6051
- score,
6052
- verdict: scoreToVerdict(score),
6053
- hits,
6054
- misses,
6055
- expectedAspectCount: expected.length
6056
- };
6057
- }
6058
- evaluateExact(toolCalls) {
6059
- const expected = this.config.expected ?? [];
6060
- if (expected.length === 0) {
6368
+ } catch {
6061
6369
  return {
6062
- score: 1,
6063
- verdict: "pass",
6064
- hits: ["No tool sequence specified"],
6370
+ score: 0,
6371
+ verdict: "fail",
6372
+ hits: [],
6065
6373
  misses: [],
6066
- expectedAspectCount: 0
6374
+ expectedAspectCount: 1,
6375
+ evaluatorRawRequest,
6376
+ evaluatorResults
6067
6377
  };
6068
6378
  }
6069
- const hits = [];
6070
- const misses = [];
6071
- if (toolCalls.length !== expected.length) {
6072
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
6073
- }
6074
- const checkLength = Math.min(expected.length, toolCalls.length);
6075
- for (let i = 0; i < checkLength; i++) {
6076
- const expectedItem = expected[i];
6077
- const expectedTool = expectedItem.tool;
6078
- const actualCall = toolCalls[i];
6079
- const actualTool = actualCall.name;
6080
- if (actualTool === expectedTool) {
6081
- if (argsMatch(expectedItem.args, actualCall.args)) {
6082
- hits.push(`Position ${i}: ${expectedTool}`);
6083
- } else {
6084
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
6379
+ }
6380
+ };
6381
+
6382
+ // src/evaluation/evaluators/cost.ts
6383
+ var CostEvaluator = class {
6384
+ kind = "cost";
6385
+ config;
6386
+ constructor(options) {
6387
+ this.config = options.config;
6388
+ }
6389
+ evaluate(context) {
6390
+ const { budget } = this.config;
6391
+ const costUsd = context.traceSummary?.costUsd;
6392
+ if (costUsd === void 0) {
6393
+ return {
6394
+ score: 0,
6395
+ verdict: "fail",
6396
+ hits: [],
6397
+ misses: ["No cost data available in trace"],
6398
+ expectedAspectCount: 1,
6399
+ reasoning: "Execution cost not reported by provider",
6400
+ evaluatorRawRequest: {
6401
+ type: "cost",
6402
+ budget,
6403
+ costUsd: null
6085
6404
  }
6086
- } else {
6087
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
6088
- }
6089
- }
6090
- for (let i = checkLength; i < expected.length; i++) {
6091
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
6405
+ };
6092
6406
  }
6093
- const score = hits.length / expected.length;
6407
+ const passed = costUsd <= budget;
6408
+ const score = passed ? 1 : 0;
6409
+ const formatCost = (n) => `$${n.toFixed(4)}`;
6094
6410
  return {
6095
6411
  score,
6096
- verdict: scoreToVerdict(score),
6097
- hits,
6098
- misses,
6099
- expectedAspectCount: expected.length
6412
+ verdict: passed ? "pass" : "fail",
6413
+ hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
6414
+ misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
6415
+ expectedAspectCount: 1,
6416
+ reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
6417
+ evaluatorRawRequest: {
6418
+ type: "cost",
6419
+ budget,
6420
+ costUsd
6421
+ }
6100
6422
  };
6101
6423
  }
6102
6424
  };
6425
+
6426
+ // src/evaluation/evaluators/field-accuracy.ts
6103
6427
  var DEFAULT_DATE_FORMATS = [
6104
6428
  "YYYY-MM-DDTHH:mm:ssZ",
6105
6429
  // ISO with timezone
@@ -6312,434 +6636,209 @@ var FieldAccuracyEvaluator = class {
6312
6636
  }
6313
6637
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
6314
6638
  return {
6315
- path: path15,
6316
- score: 0,
6317
- weight,
6318
- hit: false,
6319
- message: `${path15} (invalid numeric value)`
6320
- };
6321
- }
6322
- const diff = Math.abs(candidateNum - expectedNum);
6323
- let withinTolerance;
6324
- if (relative) {
6325
- const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
6326
- withinTolerance = relativeDiff <= tolerance;
6327
- } else {
6328
- withinTolerance = diff <= tolerance;
6329
- }
6330
- if (withinTolerance) {
6331
- return {
6332
- path: path15,
6333
- score: 1,
6334
- weight,
6335
- hit: true,
6336
- message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
6337
- };
6338
- }
6339
- return {
6340
- path: path15,
6341
- score: 0,
6342
- weight,
6343
- hit: false,
6344
- message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6345
- };
6346
- }
6347
- /**
6348
- * Date comparison with format normalization.
6349
- */
6350
- compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
6351
- const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6352
- const candidateDate = parseDate(String(candidateValue), formats);
6353
- const expectedDate = parseDate(String(expectedValue), formats);
6354
- if (candidateDate === null) {
6355
- return {
6356
- path: path15,
6357
- score: 0,
6358
- weight,
6359
- hit: false,
6360
- message: `${path15} (unparseable candidate date)`
6361
- };
6362
- }
6363
- if (expectedDate === null) {
6364
- return {
6365
- path: path15,
6366
- score: 0,
6367
- weight,
6368
- hit: false,
6369
- message: `${path15} (unparseable expected date)`
6370
- };
6371
- }
6372
- if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6373
- return {
6374
- path: path15,
6375
- score: 1,
6376
- weight,
6377
- hit: true,
6378
- message: path15
6379
- };
6380
- }
6381
- return {
6382
- path: path15,
6383
- score: 0,
6384
- weight,
6385
- hit: false,
6386
- message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6387
- };
6388
- }
6389
- /**
6390
- * Aggregate field results using configured strategy.
6391
- */
6392
- aggregateResults(results) {
6393
- const aggregation = this.config.aggregation ?? "weighted_average";
6394
- const hits = [];
6395
- const misses = [];
6396
- for (const result of results) {
6397
- if (result.hit) {
6398
- hits.push(result.message);
6399
- } else {
6400
- misses.push(result.message);
6401
- }
6402
- }
6403
- let score;
6404
- if (aggregation === "all_or_nothing") {
6405
- score = misses.length === 0 ? 1 : 0;
6406
- } else {
6407
- const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
6408
- if (totalWeight === 0) {
6409
- score = results.length === 0 ? 1 : 0;
6410
- } else {
6411
- const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
6412
- score = weightedSum / totalWeight;
6413
- }
6414
- }
6415
- const reasoning = `${hits.length}/${results.length} fields matched`;
6416
- return {
6417
- score: clampScore(score),
6418
- verdict: scoreToVerdict(score),
6419
- hits: hits.slice(0, 4),
6420
- misses: misses.slice(0, 4),
6421
- expectedAspectCount: results.length,
6422
- reasoning
6423
- };
6424
- }
6425
- };
6426
- function resolvePath(obj, path15) {
6427
- if (!path15 || !obj) {
6428
- return void 0;
6429
- }
6430
- const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
6431
- let current = obj;
6432
- for (const part of parts) {
6433
- if (current === null || current === void 0) {
6434
- return void 0;
6435
- }
6436
- if (typeof current !== "object") {
6437
- return void 0;
6438
- }
6439
- const isIndex = /^\d+$/.test(part);
6440
- if (isIndex && Array.isArray(current)) {
6441
- current = current[Number.parseInt(part, 10)];
6442
- } else {
6443
- current = current[part];
6444
- }
6445
- }
6446
- return current;
6447
- }
6448
- function toNumber(value) {
6449
- if (typeof value === "number") {
6450
- return value;
6451
- }
6452
- if (typeof value === "string") {
6453
- const num = Number.parseFloat(value);
6454
- return Number.isNaN(num) ? null : num;
6455
- }
6456
- return null;
6457
- }
6458
- function parseDate(dateStr, formats) {
6459
- if (!dateStr) return null;
6460
- const trimmed = dateStr.trim();
6461
- const isoDate = new Date(trimmed);
6462
- if (!Number.isNaN(isoDate.getTime())) {
6463
- return isoDate;
6464
- }
6465
- const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
6466
- if (localizedMatch) {
6467
- const day = Number.parseInt(localizedMatch[1], 10);
6468
- const monthName = localizedMatch[2].toLowerCase();
6469
- const year = Number.parseInt(localizedMatch[3], 10);
6470
- const month = MONTH_NAMES[monthName];
6471
- if (month !== void 0) {
6472
- return new Date(year, month, day);
6473
- }
6474
- }
6475
- const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
6476
- if (usMatch) {
6477
- const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
6478
- const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
6479
- if (hasUSFormat && !hasEUFormat) {
6480
- const month = Number.parseInt(usMatch[1], 10) - 1;
6481
- const day = Number.parseInt(usMatch[2], 10);
6482
- const year = Number.parseInt(usMatch[3], 10);
6483
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6484
- return new Date(year, month, day);
6485
- }
6486
- } else if (hasEUFormat && !hasUSFormat) {
6487
- const day = Number.parseInt(usMatch[1], 10);
6488
- const month = Number.parseInt(usMatch[2], 10) - 1;
6489
- const year = Number.parseInt(usMatch[3], 10);
6490
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6491
- return new Date(year, month, day);
6492
- }
6493
- } else {
6494
- const num1 = Number.parseInt(usMatch[1], 10);
6495
- const num2 = Number.parseInt(usMatch[2], 10);
6496
- const year = Number.parseInt(usMatch[3], 10);
6497
- if (num1 > 12 && num2 <= 12) {
6498
- return new Date(year, num2 - 1, num1);
6499
- }
6500
- if (num2 > 12 && num1 <= 12) {
6501
- return new Date(year, num1 - 1, num2);
6502
- }
6503
- if (num1 <= 12 && num2 <= 31) {
6504
- return new Date(year, num1 - 1, num2);
6505
- }
6506
- }
6507
- }
6508
- return null;
6509
- }
6510
- function formatDateISO(date) {
6511
- return date.toISOString().split("T")[0];
6512
- }
6513
- function parseJsonFromTextSafe(text) {
6514
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6515
- const match = cleaned.match(/\{[\s\S]*\}/);
6516
- const blob = match?.[0] ?? cleaned;
6517
- return JSON.parse(blob);
6518
- }
6519
- var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
6520
- {{EVALUATOR_RESULTS_JSON}}
6521
-
6522
- Decide the final score and verdict based on all evaluator results.
6523
- Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
6524
- var CompositeEvaluator = class {
6525
- kind = "composite";
6526
- config;
6527
- evaluatorFactory;
6528
- cwd;
6529
- constructor(options) {
6530
- this.config = options.config;
6531
- this.evaluatorFactory = options.evaluatorFactory;
6532
- this.cwd = options.cwd;
6533
- }
6534
- async evaluate(context) {
6535
- const memberResults = await Promise.all(
6536
- this.config.evaluators.map(async (memberConfig) => {
6537
- const evaluator = this.evaluatorFactory.create(memberConfig, context);
6538
- return {
6539
- id: memberConfig.name,
6540
- type: memberConfig.type,
6541
- result: await evaluator.evaluate(context)
6542
- };
6543
- })
6544
- );
6545
- return this.aggregate(memberResults, context);
6546
- }
6547
- async aggregate(results, context) {
6548
- const aggregator = this.config.aggregator;
6549
- switch (aggregator.type) {
6550
- case "code_judge":
6551
- return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
6552
- case "llm_judge":
6553
- return this.runLlmAggregator(results, context, aggregator);
6554
- default:
6555
- return this.runWeightedAverage(results, aggregator.weights);
6556
- }
6557
- }
6558
- runWeightedAverage(results, weights) {
6559
- let totalWeight = 0;
6560
- let weightedSum = 0;
6561
- const allHits = [];
6562
- const allMisses = [];
6563
- const reasoningParts = [];
6564
- const evaluatorResults = [];
6565
- for (const member of results) {
6566
- const weight = weights?.[member.id] ?? 1;
6567
- totalWeight += weight;
6568
- weightedSum += member.result.score * weight;
6569
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
6570
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
6571
- if (member.result.reasoning) {
6572
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
6573
- }
6574
- evaluatorResults.push({
6575
- name: member.id,
6576
- type: member.type,
6577
- score: member.result.score,
6578
- weight,
6579
- verdict: member.result.verdict,
6580
- hits: [...member.result.hits],
6581
- misses: [...member.result.misses],
6582
- reasoning: member.result.reasoning,
6583
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6584
- evaluatorResults: member.result.evaluatorResults
6585
- });
6586
- }
6587
- const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
6588
- return {
6589
- score: clampScore(finalScore),
6590
- verdict: scoreToVerdict(finalScore),
6591
- hits: allHits,
6592
- misses: allMisses,
6593
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
6594
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
6595
- evaluatorRawRequest: {
6596
- aggregator: "weighted_average",
6597
- ...weights ? { weights } : {}
6598
- },
6599
- evaluatorResults
6600
- };
6601
- }
6602
- async runCodeAggregator(results, scriptPath, cwd, weights) {
6603
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6604
- const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
6605
- const evaluatorResults = results.map((member) => ({
6606
- name: member.id,
6607
- type: member.type,
6608
- score: member.result.score,
6609
- weight: weights?.[member.id] ?? 1,
6610
- verdict: member.result.verdict,
6611
- hits: [...member.result.hits],
6612
- misses: [...member.result.misses],
6613
- reasoning: member.result.reasoning,
6614
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6615
- evaluatorResults: member.result.evaluatorResults
6616
- }));
6617
- try {
6618
- const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6619
- const parsed = parseJsonSafe(stdout);
6620
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6621
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6622
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6623
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6624
- const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
6625
- return {
6626
- score,
6627
- verdict,
6628
- hits,
6629
- misses,
6630
- expectedAspectCount: hits.length + misses.length || 1,
6631
- reasoning,
6632
- evaluatorRawRequest: {
6633
- aggregator: "code_judge",
6634
- script: scriptPath
6635
- },
6636
- evaluatorResults
6637
- };
6638
- } catch (error) {
6639
- const message = error instanceof Error ? error.message : String(error);
6640
- return {
6641
- score: 0,
6642
- verdict: "fail",
6643
- hits: [],
6644
- misses: [`Code aggregator failed: ${message}`],
6645
- expectedAspectCount: 1,
6646
- reasoning: message,
6647
- evaluatorRawRequest: {
6648
- aggregator: "code_judge",
6649
- script: scriptPath,
6650
- error: message
6651
- },
6652
- evaluatorResults
6653
- };
6654
- }
6655
- }
6656
- async runLlmAggregator(results, context, config) {
6657
- const judgeProvider = context.judgeProvider;
6658
- if (!judgeProvider) {
6659
- throw new Error("No judge provider available for LLM aggregation");
6660
- }
6661
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6662
- const resultsJson = JSON.stringify(resultsObject, null, 2);
6663
- const evaluatorResults = results.map((member) => ({
6664
- name: member.id,
6665
- type: member.type,
6666
- score: member.result.score,
6667
- verdict: member.result.verdict,
6668
- hits: [...member.result.hits],
6669
- misses: [...member.result.misses],
6670
- reasoning: member.result.reasoning,
6671
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6672
- evaluatorResults: member.result.evaluatorResults
6673
- }));
6674
- const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
6675
- const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
6676
- const systemPrompt = buildOutputSchema();
6677
- const evaluatorRawRequest = {
6678
- aggregator: "llm_judge",
6679
- userPrompt,
6680
- systemPrompt,
6681
- target: judgeProvider.targetName
6682
- };
6683
- try {
6684
- const model = judgeProvider.asLanguageModel?.();
6685
- if (model) {
6686
- const { text } = await generateText2({
6687
- model,
6688
- system: systemPrompt,
6689
- prompt: userPrompt
6690
- });
6691
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6692
- const score2 = clampScore(data2.score);
6693
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
6694
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
6695
- const reasoning2 = data2.reasoning;
6696
- return {
6697
- score: score2,
6698
- verdict: scoreToVerdict(score2),
6699
- hits: hits2,
6700
- misses: misses2,
6701
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
6702
- reasoning: reasoning2,
6703
- evaluatorRawRequest,
6704
- evaluatorResults
6705
- };
6706
- }
6707
- const response = await judgeProvider.invoke({
6708
- question: userPrompt,
6709
- systemPrompt,
6710
- evalCaseId: context.evalCase.id,
6711
- attempt: context.attempt
6712
- });
6713
- const data = freeformEvaluationSchema.parse(
6714
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
6715
- );
6716
- const score = clampScore(data.score);
6717
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
6718
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
6719
- const reasoning = data.reasoning;
6720
- return {
6721
- score,
6722
- verdict: scoreToVerdict(score),
6723
- hits,
6724
- misses,
6725
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
6726
- reasoning,
6727
- evaluatorRawRequest,
6728
- evaluatorResults
6639
+ path: path15,
6640
+ score: 0,
6641
+ weight,
6642
+ hit: false,
6643
+ message: `${path15} (invalid numeric value)`
6729
6644
  };
6730
- } catch {
6645
+ }
6646
+ const diff = Math.abs(candidateNum - expectedNum);
6647
+ let withinTolerance;
6648
+ if (relative) {
6649
+ const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
6650
+ withinTolerance = relativeDiff <= tolerance;
6651
+ } else {
6652
+ withinTolerance = diff <= tolerance;
6653
+ }
6654
+ if (withinTolerance) {
6655
+ return {
6656
+ path: path15,
6657
+ score: 1,
6658
+ weight,
6659
+ hit: true,
6660
+ message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
6661
+ };
6662
+ }
6663
+ return {
6664
+ path: path15,
6665
+ score: 0,
6666
+ weight,
6667
+ hit: false,
6668
+ message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6669
+ };
6670
+ }
6671
+ /**
6672
+ * Date comparison with format normalization.
6673
+ */
6674
+ compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
6675
+ const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6676
+ const candidateDate = parseDate(String(candidateValue), formats);
6677
+ const expectedDate = parseDate(String(expectedValue), formats);
6678
+ if (candidateDate === null) {
6731
6679
  return {
6680
+ path: path15,
6732
6681
  score: 0,
6733
- verdict: "fail",
6734
- hits: [],
6735
- misses: [],
6736
- expectedAspectCount: 1,
6737
- evaluatorRawRequest,
6738
- evaluatorResults
6682
+ weight,
6683
+ hit: false,
6684
+ message: `${path15} (unparseable candidate date)`
6685
+ };
6686
+ }
6687
+ if (expectedDate === null) {
6688
+ return {
6689
+ path: path15,
6690
+ score: 0,
6691
+ weight,
6692
+ hit: false,
6693
+ message: `${path15} (unparseable expected date)`
6694
+ };
6695
+ }
6696
+ if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6697
+ return {
6698
+ path: path15,
6699
+ score: 1,
6700
+ weight,
6701
+ hit: true,
6702
+ message: path15
6739
6703
  };
6740
6704
  }
6705
+ return {
6706
+ path: path15,
6707
+ score: 0,
6708
+ weight,
6709
+ hit: false,
6710
+ message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6711
+ };
6712
+ }
6713
+ /**
6714
+ * Aggregate field results using configured strategy.
6715
+ */
6716
+ aggregateResults(results) {
6717
+ const aggregation = this.config.aggregation ?? "weighted_average";
6718
+ const hits = [];
6719
+ const misses = [];
6720
+ for (const result of results) {
6721
+ if (result.hit) {
6722
+ hits.push(result.message);
6723
+ } else {
6724
+ misses.push(result.message);
6725
+ }
6726
+ }
6727
+ let score;
6728
+ if (aggregation === "all_or_nothing") {
6729
+ score = misses.length === 0 ? 1 : 0;
6730
+ } else {
6731
+ const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
6732
+ if (totalWeight === 0) {
6733
+ score = results.length === 0 ? 1 : 0;
6734
+ } else {
6735
+ const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
6736
+ score = weightedSum / totalWeight;
6737
+ }
6738
+ }
6739
+ const reasoning = `${hits.length}/${results.length} fields matched`;
6740
+ return {
6741
+ score: clampScore(score),
6742
+ verdict: scoreToVerdict(score),
6743
+ hits: hits.slice(0, 4),
6744
+ misses: misses.slice(0, 4),
6745
+ expectedAspectCount: results.length,
6746
+ reasoning
6747
+ };
6741
6748
  }
6742
6749
  };
6750
/**
 * Resolve a dotted / bracketed path such as `a.b[0].c` against a value.
 *
 * Only own properties are followed, so a path segment like `toString` or
 * `constructor` resolves to `undefined` instead of leaking a member inherited
 * from the prototype chain.
 *
 * @param {unknown} obj - Root value to walk; falsy roots resolve to `undefined`.
 * @param {string} path15 - Path using `.` and `[n]` separators.
 * @returns {unknown} The value at the path, or `undefined` when any segment is missing.
 */
function resolvePath(obj, path15) {
  if (!path15 || !obj) {
    return void 0;
  }
  // Splitting on ".", "[" and "]" leaves empty strings between adjacent separators.
  const segments = path15.split(/\.|\[|\]/).filter((segment) => segment.length > 0);
  let current = obj;
  for (const segment of segments) {
    if (current === null || typeof current !== "object") {
      return void 0;
    }
    // Numeric segments on arrays are normalized ("01" -> "1") so they address the element index.
    const key = /^\d+$/.test(segment) && Array.isArray(current) ? String(Number.parseInt(segment, 10)) : segment;
    if (!Object.hasOwn(current, key)) {
      return void 0;
    }
    current = current[key];
  }
  return current;
}
6772
/**
 * Coerce a value to a finite number for numeric comparison.
 *
 * Strings are parsed leniently with `Number.parseFloat`, so a leading numeric
 * prefix such as "12abc" still yields 12. `NaN` and infinite values, whether
 * passed as numbers or produced by parsing, are reported as `null` so callers
 * never compute a `NaN` difference.
 *
 * @param {unknown} value - Candidate number or numeric string.
 * @returns {number | null} A finite number, or `null` when the value is not numeric.
 */
function toNumber(value) {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : null;
  }
  if (typeof value === "string") {
    const parsed = Number.parseFloat(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
6782
/**
 * Parse a date string into a `Date` whose local year/month/day are the
 * calendar day written in the string.
 *
 * Explicit patterns are tried before the engine's permissive `Date` parser:
 *   1. Date-only ISO (`YYYY-MM-DD`) is built as a local date. `new Date("YYYY-MM-DD")`
 *      would yield UTC midnight, which can fall on a different local day than
 *      the dates built by the other branches.
 *   2. `D-Mon-YYYY` with a month name looked up in `MONTH_NAMES`.
 *   3. Numeric `A/B/YYYY` or `A-B-YYYY`, disambiguated by the `formats` hints
 *      (`MM/DD` vs `DD/MM`). Running this before the generic parser is what
 *      lets a `DD/MM` hint take effect for ambiguous input like `02/03/2024`.
 *   4. Anything else falls back to `new Date(string)`.
 *
 * @param {string} dateStr - Raw date text.
 * @param {string[]} formats - Format hints, e.g. `["DD/MM/YYYY"]`.
 * @returns {Date | null} The parsed date, or `null` when nothing matched.
 */
function parseDate(dateStr, formats) {
  if (!dateStr) return null;
  const trimmed = dateStr.trim();
  const hints = formats ?? [];
  const isoDateOnly = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
  if (isoDateOnly) {
    const year = Number.parseInt(isoDateOnly[1], 10);
    const month = Number.parseInt(isoDateOnly[2], 10) - 1;
    const day = Number.parseInt(isoDateOnly[3], 10);
    if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
      return new Date(year, month, day);
    }
  }
  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
  if (localizedMatch) {
    const day = Number.parseInt(localizedMatch[1], 10);
    const monthName = localizedMatch[2].toLowerCase();
    const year = Number.parseInt(localizedMatch[3], 10);
    const month = MONTH_NAMES[monthName];
    if (month !== void 0) {
      return new Date(year, month, day);
    }
  }
  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
  if (numericMatch) {
    const first = Number.parseInt(numericMatch[1], 10);
    const second = Number.parseInt(numericMatch[2], 10);
    const year = Number.parseInt(numericMatch[3], 10);
    const hasUSFormat = hints.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
    const hasEUFormat = hints.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
    if (hasUSFormat && !hasEUFormat) {
      // Month first.
      if (first >= 1 && first <= 12 && second >= 1 && second <= 31) {
        return new Date(year, first - 1, second);
      }
    } else if (hasEUFormat && !hasUSFormat) {
      // Day first.
      if (second >= 1 && second <= 12 && first >= 1 && first <= 31) {
        return new Date(year, second - 1, first);
      }
    } else {
      // No usable hint: a component above 12 can only be the day; otherwise assume month first.
      if (first > 12 && second >= 1 && second <= 12) {
        return new Date(year, second - 1, first);
      }
      if (second > 12 && first >= 1 && first <= 12) {
        return new Date(year, first - 1, second);
      }
      if (first >= 1 && first <= 12 && second >= 1 && second <= 31) {
        return new Date(year, first - 1, second);
      }
    }
  }
  // Last resort: engine-defined parsing (full ISO timestamps, RFC 2822 strings, ...).
  const generic = new Date(trimmed);
  return Number.isNaN(generic.getTime()) ? null : generic;
}
6834
/**
 * Format a date as `YYYY-MM-DD` using its local calendar components.
 *
 * Local getters are used instead of `toISOString()` because the latter
 * converts to UTC first, which can print a different day than the one
 * obtained from `getFullYear()` / `getMonth()` / `getDate()`.
 *
 * @param {Date} date - A valid date.
 * @returns {string} The local calendar day in `YYYY-MM-DD` form.
 */
function formatDateISO(date) {
  const year = String(date.getFullYear()).padStart(4, "0");
  const month = String(date.getMonth() + 1).padStart(2, "0");
  const day = String(date.getDate()).padStart(2, "0");
  return `${year}-${month}-${day}`;
}
6837
/**
 * Thin pass-through to `parseJsonFromText`.
 *
 * Despite the name, no error handling is added here: whatever
 * `parseJsonFromText` returns or throws reaches the caller unchanged.
 */
function parseJsonFromTextSafe(text) {
  const parsed = parseJsonFromText(text);
  return parsed;
}
6840
+
6841
+ // src/evaluation/evaluators/latency.ts
6743
6842
  var LatencyEvaluator = class {
6744
6843
  kind = "latency";
6745
6844
  config;
@@ -6772,57 +6871,17 @@ var LatencyEvaluator = class {
6772
6871
  hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
6773
6872
  misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
6774
6873
  expectedAspectCount: 1,
6775
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
6776
- evaluatorRawRequest: {
6777
- type: "latency",
6778
- threshold,
6779
- durationMs
6780
- }
6781
- };
6782
- }
6783
- };
6784
- var CostEvaluator = class {
6785
- kind = "cost";
6786
- config;
6787
- constructor(options) {
6788
- this.config = options.config;
6789
- }
6790
- evaluate(context) {
6791
- const { budget } = this.config;
6792
- const costUsd = context.traceSummary?.costUsd;
6793
- if (costUsd === void 0) {
6794
- return {
6795
- score: 0,
6796
- verdict: "fail",
6797
- hits: [],
6798
- misses: ["No cost data available in trace"],
6799
- expectedAspectCount: 1,
6800
- reasoning: "Execution cost not reported by provider",
6801
- evaluatorRawRequest: {
6802
- type: "cost",
6803
- budget,
6804
- costUsd: null
6805
- }
6806
- };
6807
- }
6808
- const passed = costUsd <= budget;
6809
- const score = passed ? 1 : 0;
6810
- const formatCost = (n) => `$${n.toFixed(4)}`;
6811
- return {
6812
- score,
6813
- verdict: passed ? "pass" : "fail",
6814
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
6815
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
6816
- expectedAspectCount: 1,
6817
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
6818
- evaluatorRawRequest: {
6819
- type: "cost",
6820
- budget,
6821
- costUsd
6874
+ reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
6875
+ evaluatorRawRequest: {
6876
+ type: "latency",
6877
+ threshold,
6878
+ durationMs
6822
6879
  }
6823
6880
  };
6824
6881
  }
6825
6882
  };
6883
+
6884
+ // src/evaluation/evaluators/token-usage.ts
6826
6885
  var TokenUsageEvaluator = class {
6827
6886
  kind = "token_usage";
6828
6887
  config;
@@ -6906,6 +6965,226 @@ var TokenUsageEvaluator = class {
6906
6965
  }
6907
6966
  };
6908
6967
 
6968
+ // src/evaluation/evaluators/tool-trajectory.ts
6969
/**
 * Check whether a tool call's actual arguments satisfy the expected ones.
 *
 * Matching is partial: every key listed in `expected` must be an own property
 * of `actual` with a deeply equal value; extra keys in `actual` are ignored.
 *
 * @param {object | "any" | null | undefined} expected - Expected arguments.
 *   `undefined`, `null` and the literal "any" place no constraint on the call.
 * @param {unknown} actual - Arguments recorded for the tool call.
 * @returns {boolean} `true` when the call satisfies the expectation.
 */
function argsMatch(expected, actual) {
  if (expected === void 0 || expected === null || expected === "any") return true;
  // A missing, null or primitive argument payload cannot carry the expected keys.
  if (actual === null || typeof actual !== "object") return false;
  for (const key of Object.keys(expected)) {
    if (!Object.hasOwn(actual, key)) return false;
    if (!deepEqual(expected[key], actual[key])) return false;
  }
  return true;
}
6979
/**
 * Scores the tool calls made during a run against the configured expectation.
 *
 * Supported `config.mode` values:
 *   - "any_order": each tool in `config.minimums` must be called at least the given number of times.
 *   - "in_order":  `config.expected` must appear as an ordered subsequence of the calls.
 *   - "exact":     the calls must match `config.expected` position by position, with no surplus calls.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Evaluate the tool calls found in `context.outputMessages`, falling back to
   * `context.traceSummary` when the messages contain none.
   */
  evaluate(context) {
    const { outputMessages, traceSummary } = context;
    const toolCalls = this.extractToolCallsFromMessages(outputMessages);
    const noTrace = {
      score: 0,
      verdict: "fail",
      hits: [],
      misses: ["No trace available for evaluation"],
      expectedAspectCount: 1
    };
    if (toolCalls.length === 0 && !traceSummary) {
      return noTrace;
    }
    // Prefer a summary derived from the actual messages over the provider-supplied one.
    const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
    if (!summary) {
      return noTrace;
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(summary);
      case "in_order":
        return this.evaluateInOrder(toolCalls);
      case "exact":
        return this.evaluateExact(toolCalls);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Extract tool calls from output messages.
   * Returns `{ name, args }` entries in message order; `[]` when `messages` is missing.
   */
  extractToolCallsFromMessages(messages) {
    if (!messages) {
      return [];
    }
    const toolCalls = [];
    for (const message of messages) {
      if (!message.toolCalls) {
        continue;
      }
      for (const call of message.toolCalls) {
        toolCalls.push({ name: call.tool, args: call.input });
      }
    }
    return toolCalls;
  }
  /**
   * Build a summary from extracted tool calls: per-tool call counts plus the
   * sorted list of distinct tool names. `errorCount` is always reported as 0.
   */
  buildSummary(toolCalls) {
    const toolCallsByName = {};
    for (const call of toolCalls) {
      toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
    }
    return {
      eventCount: toolCalls.length,
      toolNames: Object.keys(toolCallsByName).sort(),
      toolCallsByName,
      errorCount: 0
    };
  }
  /**
   * "any_order" mode: the score is the fraction of tools in `config.minimums`
   * whose call count reaches the required minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      const actual = summary.toolCallsByName[toolName] ?? 0;
      const line = `${toolName}: called ${actual} times (required \u2265${required})`;
      if (actual >= required) {
        hits.push(line);
      } else {
        misses.push(line);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * "in_order" mode: walk the calls once, matching each expected item to the
   * next call with the same tool name. Calls to other tools in between are
   * skipped. A same-name call whose args do not match consumes that call and
   * counts as a miss for the expected item.
   */
  evaluateInOrder(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    // Cursor into toolCalls; it only moves forward, which enforces ordering.
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      let found = false;
      let argsMismatch = false;
      while (actualIndex < toolCalls.length) {
        const actualCall = toolCalls[actualIndex];
        if (actualCall.name === expectedTool) {
          if (argsMatch(expectedItem.args, actualCall.args)) {
            hits.push(`Found ${expectedTool} at position ${actualIndex}`);
            actualIndex++;
            found = true;
            break;
          }
          misses.push(
            `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
          );
          actualIndex++;
          argsMismatch = true;
          break;
        }
        actualIndex++;
      }
      if (!found && !argsMismatch) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * "exact" mode: compare calls and expectations position by position.
   *
   * The score is divided by the longer of the two sequences, so surplus tool
   * calls lower the score instead of only adding a miss message next to a
   * perfect score.
   */
  evaluateExact(toolCalls) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    if (toolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, toolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedItem = expected[i];
      const expectedTool = expectedItem.tool;
      const actualCall = toolCalls[i];
      const actualTool = actualCall.name;
      if (actualTool !== expectedTool) {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      } else if (argsMatch(expectedItem.args, actualCall.args)) {
        hits.push(`Position ${i}: ${expectedTool}`);
      } else {
        misses.push(`Position ${i}: ${expectedTool} args mismatch`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // expected.length > 0 here, so the divisor is never zero.
    const score = hits.length / Math.max(expected.length, toolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
7187
+
6909
7188
  // src/evaluation/orchestrator.ts
6910
7189
  import { createHash } from "node:crypto";
6911
7190
  import path14 from "node:path";
@@ -7119,6 +7398,17 @@ async function runEvaluation(options) {
7119
7398
  }
7120
7399
  return getOrCreateProvider(resolvedJudge);
7121
7400
  };
7401
+ const targetResolver = (name) => {
7402
+ const resolved = resolveTargetByName(name);
7403
+ if (!resolved) {
7404
+ return void 0;
7405
+ }
7406
+ return getOrCreateProvider(resolved);
7407
+ };
7408
+ const availableTargets = [
7409
+ target.name,
7410
+ ...Array.from(targetDefinitions.keys())
7411
+ ];
7122
7412
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
7123
7413
  const primaryProvider = getOrCreateProvider(target);
7124
7414
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -7148,7 +7438,9 @@ async function runEvaluation(options) {
7148
7438
  onResult,
7149
7439
  verbose,
7150
7440
  resolveJudgeProvider,
7151
- agentTimeoutMs
7441
+ agentTimeoutMs,
7442
+ targetResolver,
7443
+ availableTargets
7152
7444
  });
7153
7445
  } catch (error) {
7154
7446
  if (verbose) {
@@ -7187,7 +7479,9 @@ async function runEvaluation(options) {
7187
7479
  cache,
7188
7480
  useCache,
7189
7481
  now,
7190
- judgeProvider
7482
+ judgeProvider,
7483
+ targetResolver,
7484
+ availableTargets
7191
7485
  });
7192
7486
  if (onProgress) {
7193
7487
  await onProgress({
@@ -7254,7 +7548,9 @@ async function runBatchEvaluation(options) {
7254
7548
  onProgress,
7255
7549
  onResult,
7256
7550
  resolveJudgeProvider,
7257
- agentTimeoutMs
7551
+ agentTimeoutMs,
7552
+ targetResolver,
7553
+ availableTargets
7258
7554
  } = options;
7259
7555
  const promptInputsList = [];
7260
7556
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -7329,7 +7625,9 @@ async function runBatchEvaluation(options) {
7329
7625
  judgeProvider: await resolveJudgeProvider(target),
7330
7626
  agentTimeoutMs,
7331
7627
  outputMessages,
7332
- traceSummary
7628
+ traceSummary,
7629
+ targetResolver,
7630
+ availableTargets
7333
7631
  });
7334
7632
  if (providerError) {
7335
7633
  result = { ...result, error: providerError };
@@ -7387,7 +7685,9 @@ async function runEvalCase(options) {
7387
7685
  cache,
7388
7686
  useCache,
7389
7687
  signal,
7390
- judgeProvider
7688
+ judgeProvider,
7689
+ targetResolver,
7690
+ availableTargets
7391
7691
  } = options;
7392
7692
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
7393
7693
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -7461,7 +7761,9 @@ async function runEvalCase(options) {
7461
7761
  judgeProvider,
7462
7762
  agentTimeoutMs,
7463
7763
  outputMessages,
7464
- traceSummary
7764
+ traceSummary,
7765
+ targetResolver,
7766
+ availableTargets
7465
7767
  });
7466
7768
  return providerError ? { ...result, error: providerError } : result;
7467
7769
  } catch (error) {
@@ -7481,7 +7783,9 @@ async function evaluateCandidate(options) {
7481
7783
  judgeProvider,
7482
7784
  agentTimeoutMs,
7483
7785
  outputMessages,
7484
- traceSummary
7786
+ traceSummary,
7787
+ targetResolver,
7788
+ availableTargets
7485
7789
  } = options;
7486
7790
  const gradeTimestamp = nowFn();
7487
7791
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -7496,7 +7800,9 @@ async function evaluateCandidate(options) {
7496
7800
  judgeProvider,
7497
7801
  agentTimeoutMs,
7498
7802
  outputMessages,
7499
- traceSummary
7803
+ traceSummary,
7804
+ targetResolver,
7805
+ availableTargets
7500
7806
  });
7501
7807
  const completedAt = nowFn();
7502
7808
  let agentProviderRequest;
@@ -7549,7 +7855,9 @@ async function runEvaluatorsForCase(options) {
7549
7855
  judgeProvider,
7550
7856
  agentTimeoutMs,
7551
7857
  outputMessages,
7552
- traceSummary
7858
+ traceSummary,
7859
+ targetResolver,
7860
+ availableTargets
7553
7861
  } = options;
7554
7862
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
7555
7863
  return runEvaluatorList({
@@ -7565,7 +7873,9 @@ async function runEvaluatorsForCase(options) {
7565
7873
  judgeProvider,
7566
7874
  agentTimeoutMs,
7567
7875
  outputMessages,
7568
- traceSummary
7876
+ traceSummary,
7877
+ targetResolver,
7878
+ availableTargets
7569
7879
  });
7570
7880
  }
7571
7881
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -7583,7 +7893,9 @@ async function runEvaluatorsForCase(options) {
7583
7893
  now,
7584
7894
  judgeProvider,
7585
7895
  outputMessages,
7586
- traceSummary
7896
+ traceSummary,
7897
+ targetResolver,
7898
+ availableTargets
7587
7899
  });
7588
7900
  return { score };
7589
7901
  }
@@ -7601,7 +7913,9 @@ async function runEvaluatorList(options) {
7601
7913
  judgeProvider,
7602
7914
  agentTimeoutMs,
7603
7915
  outputMessages,
7604
- traceSummary
7916
+ traceSummary,
7917
+ targetResolver,
7918
+ availableTargets
7605
7919
  } = options;
7606
7920
  const scored = [];
7607
7921
  const evaluatorResults = [];
@@ -7639,7 +7953,8 @@ async function runEvaluatorList(options) {
7639
7953
  script: evaluator.script,
7640
7954
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
7641
7955
  agentTimeoutMs,
7642
- config: evaluator.config
7956
+ config: evaluator.config,
7957
+ target: evaluator.target
7643
7958
  });
7644
7959
  const score2 = await codeEvaluator.evaluate({
7645
7960
  evalCase,
@@ -7649,8 +7964,11 @@ async function runEvaluatorList(options) {
7649
7964
  attempt,
7650
7965
  promptInputs,
7651
7966
  now,
7967
+ judgeProvider,
7652
7968
  outputMessages,
7653
- traceSummary
7969
+ traceSummary,
7970
+ targetResolver,
7971
+ availableTargets
7654
7972
  });
7655
7973
  const weight = evaluator.weight ?? 1;
7656
7974
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -7663,7 +7981,8 @@ async function runEvaluatorList(options) {
7663
7981
  hits: score2.hits,
7664
7982
  misses: score2.misses,
7665
7983
  reasoning: score2.reasoning,
7666
- evaluatorProviderRequest: score2.evaluatorRawRequest
7984
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
7985
+ details: score2.details
7667
7986
  });
7668
7987
  }
7669
7988
  if (evaluator.type === "composite") {
@@ -7677,7 +7996,8 @@ async function runEvaluatorList(options) {
7677
7996
  script: memberConfig.script,
7678
7997
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
7679
7998
  agentTimeoutMs,
7680
- config: memberConfig.config
7999
+ config: memberConfig.config,
8000
+ target: memberConfig.target
7681
8001
  });
7682
8002
  case "composite":
7683
8003
  return new CompositeEvaluator({
@@ -7726,7 +8046,9 @@ async function runEvaluatorList(options) {
7726
8046
  now,
7727
8047
  judgeProvider,
7728
8048
  outputMessages,
7729
- traceSummary
8049
+ traceSummary,
8050
+ targetResolver,
8051
+ availableTargets
7730
8052
  });
7731
8053
  const weight = evaluator.weight ?? 1;
7732
8054
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -7922,11 +8244,11 @@ async function runEvaluatorList(options) {
7922
8244
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
7923
8245
  0
7924
8246
  );
7925
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
8247
+ const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
7926
8248
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
7927
8249
  const score = {
7928
8250
  score: aggregateScore,
7929
- verdict: scoreToVerdict2(aggregateScore),
8251
+ verdict: scoreToVerdict(aggregateScore),
7930
8252
  hits,
7931
8253
  misses,
7932
8254
  expectedAspectCount,
@@ -7973,18 +8295,6 @@ async function resolveCustomPrompt(config) {
7973
8295
  }
7974
8296
  return config.prompt;
7975
8297
  }
7976
- function isNonEmptyString2(value) {
7977
- return typeof value === "string" && value.trim().length > 0;
7978
- }
7979
- function scoreToVerdict2(score) {
7980
- if (score >= 0.8) {
7981
- return "pass";
7982
- }
7983
- if (score >= 0.6) {
7984
- return "borderline";
7985
- }
7986
- return "fail";
7987
- }
7988
8298
  function filterEvalCases(evalCases, evalId) {
7989
8299
  if (!evalId) {
7990
8300
  return evalCases;
@@ -8127,7 +8437,8 @@ function mapChildResults(children) {
8127
8437
  misses: child.misses,
8128
8438
  reasoning: child.reasoning,
8129
8439
  evaluatorProviderRequest: child.evaluatorRawRequest,
8130
- evaluatorResults: mapChildResults(child.evaluatorResults)
8440
+ evaluatorResults: mapChildResults(child.evaluatorResults),
8441
+ details: child.details
8131
8442
  }));
8132
8443
  }
8133
8444
  function computeWeightedMean(entries) {
@@ -8142,7 +8453,7 @@ function computeWeightedMean(entries) {
8142
8453
  }
8143
8454
 
8144
8455
  // src/evaluation/generators/rubric-generator.ts
8145
- import { generateText as generateText3 } from "ai";
8456
+ import { generateText as generateText4 } from "ai";
8146
8457
  import { z as z3 } from "zod";
8147
8458
  var rubricItemSchema = z3.object({
8148
8459
  id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -8176,7 +8487,7 @@ You must return a valid JSON object matching this schema:
8176
8487
  let lastError;
8177
8488
  for (let attempt = 1; attempt <= 3; attempt++) {
8178
8489
  try {
8179
- const { text } = await generateText3({
8490
+ const { text } = await generateText4({
8180
8491
  model,
8181
8492
  system,
8182
8493
  prompt
@@ -8238,31 +8549,39 @@ export {
8238
8549
  ToolTrajectoryEvaluator,
8239
8550
  avgToolDurationMs,
8240
8551
  buildDirectoryChain,
8552
+ buildOutputSchema,
8241
8553
  buildPromptInputs,
8242
8554
  buildSearchRoots,
8555
+ clampScore,
8243
8556
  computeTraceSummary,
8244
8557
  consumeClaudeCodeLogEntries,
8245
8558
  consumeCodexLogEntries,
8246
8559
  consumePiLogEntries,
8247
8560
  createAgentKernel,
8248
8561
  createProvider,
8562
+ deepEqual,
8249
8563
  ensureVSCodeSubagents,
8564
+ executeScript,
8250
8565
  explorationRatio,
8251
- extractCodeBlocks,
8566
+ extractJsonBlob,
8252
8567
  fileExists,
8253
8568
  findGitRoot,
8569
+ freeformEvaluationSchema,
8254
8570
  generateRubrics,
8255
8571
  getHitCount,
8256
8572
  isEvaluatorKind,
8257
8573
  isGuidelineFile,
8258
8574
  isJsonObject,
8259
8575
  isJsonValue,
8576
+ isNonEmptyString,
8260
8577
  isTestMessage,
8261
8578
  isTestMessageRole,
8262
8579
  listTargetNames,
8263
8580
  loadEvalCases,
8264
8581
  mergeExecutionMetrics,
8265
8582
  normalizeLineEndings,
8583
+ parseJsonFromText,
8584
+ parseJsonSafe,
8266
8585
  readJsonFile,
8267
8586
  readTargetDefinitions,
8268
8587
  readTestSuiteMetadata,
@@ -8272,6 +8591,7 @@ export {
8272
8591
  resolveTargetDefinition,
8273
8592
  runEvalCase,
8274
8593
  runEvaluation,
8594
+ scoreToVerdict,
8275
8595
  subscribeToClaudeCodeLogEntries,
8276
8596
  subscribeToCodexLogEntries,
8277
8597
  subscribeToPiLogEntries,