@agentv/core 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -150,85 +150,6 @@ import { readFile as readFile5 } from "node:fs/promises";
150
150
  import path6 from "node:path";
151
151
  import { parse as parse2 } from "yaml";
152
152
 
153
- // src/evaluation/formatting/segment-formatter.ts
154
- function extractCodeBlocks(segments) {
155
- const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
156
- const codeBlocks = [];
157
- for (const segment of segments) {
158
- const typeValue = segment.type;
159
- if (typeof typeValue !== "string" || typeValue !== "text") {
160
- continue;
161
- }
162
- const textValue = segment.value;
163
- if (typeof textValue !== "string") {
164
- continue;
165
- }
166
- const matches = textValue.match(CODE_BLOCK_PATTERN);
167
- if (matches) {
168
- codeBlocks.push(...matches);
169
- }
170
- }
171
- return codeBlocks;
172
- }
173
- function formatFileContents(parts) {
174
- const fileCount = parts.filter((p) => p.isFile).length;
175
- if (fileCount > 0) {
176
- return parts.map((part) => {
177
- if (part.isFile && part.displayPath) {
178
- return `<file path="${part.displayPath}">
179
- ${part.content}
180
- </file>`;
181
- }
182
- return part.content;
183
- }).join("\n\n");
184
- }
185
- return parts.map((p) => p.content).join(" ");
186
- }
187
- function formatSegment(segment, mode = "lm") {
188
- const type = asString(segment.type);
189
- if (type === "text") {
190
- return asString(segment.value);
191
- }
192
- if (type === "guideline_ref") {
193
- const refPath = asString(segment.path);
194
- return refPath ? `<Attached: ${refPath}>` : void 0;
195
- }
196
- if (type === "file") {
197
- const filePath = asString(segment.path);
198
- if (!filePath) {
199
- return void 0;
200
- }
201
- if (mode === "agent") {
202
- return `<file: path="${filePath}">`;
203
- }
204
- const text = asString(segment.text);
205
- if (text && filePath) {
206
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
207
- }
208
- }
209
- return void 0;
210
- }
211
- function hasVisibleContent(segments) {
212
- return segments.some((segment) => {
213
- const type = asString(segment.type);
214
- if (type === "text") {
215
- const value = asString(segment.value);
216
- return value !== void 0 && value.trim().length > 0;
217
- }
218
- if (type === "guideline_ref") {
219
- return false;
220
- }
221
- if (type === "file") {
222
- const text = asString(segment.text);
223
- return text !== void 0 && text.trim().length > 0;
224
- }
225
- return false;
226
- });
227
- }
228
- function asString(value) {
229
- return typeof value === "string" ? value : void 0;
230
- }
231
-
232
153
  // src/evaluation/loaders/config-loader.ts
233
154
  import { readFile } from "node:fs/promises";
234
155
  import path2 from "node:path";
@@ -483,7 +404,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
483
404
  logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
484
405
  continue;
485
406
  }
486
- const name = asString2(rawEvaluator.name);
407
+ const name = asString(rawEvaluator.name);
487
408
  const typeValue = rawEvaluator.type;
488
409
  if (!name || !isEvaluatorKind(typeValue)) {
489
410
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
@@ -511,7 +432,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
511
432
  continue;
512
433
  }
513
434
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
514
- const cwd = asString2(rawEvaluator.cwd);
435
+ const cwd = asString(rawEvaluator.cwd);
515
436
  let resolvedCwd;
516
437
  if (cwd) {
517
438
  const resolved = await resolveFileReference2(cwd, searchRoots);
@@ -526,7 +447,29 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
526
447
  } else {
527
448
  resolvedCwd = searchRoots[0];
528
449
  }
529
- const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
450
+ const rawTarget = rawEvaluator.target;
451
+ let targetConfig;
452
+ if (rawTarget !== void 0) {
453
+ if (isJsonObject2(rawTarget)) {
454
+ const maxCalls = rawTarget.max_calls;
455
+ if (maxCalls !== void 0 && (typeof maxCalls !== "number" || maxCalls < 0)) {
456
+ logWarning2(
457
+ `Invalid target.max_calls for evaluator '${name}' in '${evalId}': must be a non-negative number`
458
+ );
459
+ } else {
460
+ targetConfig = {
461
+ ...typeof maxCalls === "number" ? { max_calls: maxCalls } : {}
462
+ };
463
+ }
464
+ } else if (rawTarget === true) {
465
+ targetConfig = {};
466
+ } else {
467
+ logWarning2(
468
+ `Invalid target config for evaluator '${name}' in '${evalId}': expected object or true`
469
+ );
470
+ }
471
+ }
472
+ const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight", "target"]);
530
473
  const config = {};
531
474
  for (const [key, value] of Object.entries(rawEvaluator)) {
532
475
  if (!knownProps.has(key) && value !== void 0) {
@@ -540,7 +483,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
540
483
  cwd,
541
484
  resolvedCwd,
542
485
  ...weight2 !== void 0 ? { weight: weight2 } : {},
543
- ...Object.keys(config).length > 0 ? { config } : {}
486
+ ...Object.keys(config).length > 0 ? { config } : {},
487
+ ...targetConfig !== void 0 ? { target: targetConfig } : {}
544
488
  });
545
489
  continue;
546
490
  }
@@ -557,7 +501,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
557
501
  logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
558
502
  continue;
559
503
  }
560
- const aggregatorType = asString2(rawAggregator.type);
504
+ const aggregatorType = asString(rawAggregator.type);
561
505
  if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
562
506
  logWarning2(
563
507
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
@@ -570,7 +514,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
570
514
  logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
571
515
  continue;
572
516
  }
573
- const memberName = asString2(rawMember.name);
517
+ const memberName = asString(rawMember.name);
574
518
  const memberType = rawMember.type;
575
519
  if (!memberName || !isEvaluatorKind(memberType)) {
576
520
  logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
@@ -608,7 +552,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
608
552
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
609
553
  };
610
554
  } else if (aggregatorType === "code_judge") {
611
- const aggregatorPath = asString2(rawAggregator.path);
555
+ const aggregatorPath = asString(rawAggregator.path);
612
556
  if (!aggregatorPath) {
613
557
  logWarning2(
614
558
  `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
@@ -621,7 +565,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
621
565
  cwd: searchRoots[0]
622
566
  };
623
567
  } else {
624
- const aggregatorPrompt = asString2(rawAggregator.prompt);
568
+ const aggregatorPrompt = asString(rawAggregator.prompt);
625
569
  let promptPath2;
626
570
  if (aggregatorPrompt) {
627
571
  const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
@@ -646,7 +590,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
646
590
  continue;
647
591
  }
648
592
  if (typeValue === "tool_trajectory") {
649
- const mode = asString2(rawEvaluator.mode);
593
+ const mode = asString(rawEvaluator.mode);
650
594
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
651
595
  logWarning2(
652
596
  `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
@@ -737,8 +681,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
737
681
  );
738
682
  continue;
739
683
  }
740
- const fieldPath = asString2(rawField.path);
741
- const match = asString2(rawField.match);
684
+ const fieldPath = asString(rawField.path);
685
+ const match = asString(rawField.match);
742
686
  if (!fieldPath) {
743
687
  logWarning2(
744
688
  `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
@@ -768,7 +712,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
768
712
  );
769
713
  continue;
770
714
  }
771
- const aggregation = asString2(rawEvaluator.aggregation);
715
+ const aggregation = asString(rawEvaluator.aggregation);
772
716
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
773
717
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
774
718
  evaluators.push({
@@ -849,7 +793,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
849
793
  });
850
794
  continue;
851
795
  }
852
- const prompt = asString2(rawEvaluator.prompt);
796
+ const prompt = asString(rawEvaluator.prompt);
853
797
  let promptPath;
854
798
  if (prompt) {
855
799
  const resolved = await resolveFileReference2(prompt, searchRoots);
@@ -868,11 +812,11 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
868
812
  );
869
813
  }
870
814
  }
871
- const _model = asString2(rawEvaluator.model);
815
+ const _model = asString(rawEvaluator.model);
872
816
  const rawRubrics = rawEvaluator.rubrics;
873
817
  const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
874
- id: asString2(rubric.id) ?? `rubric-${index + 1}`,
875
- description: asString2(rubric.description) ?? "",
818
+ id: asString(rubric.id) ?? `rubric-${index + 1}`,
819
+ description: asString(rubric.description) ?? "",
876
820
  weight: typeof rubric.weight === "number" ? rubric.weight : 1,
877
821
  required: typeof rubric.required === "boolean" ? rubric.required : true
878
822
  })).filter((r) => r.description.length > 0) : void 0;
@@ -916,7 +860,7 @@ function coerceEvaluator(candidate, contextId) {
916
860
  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
917
861
  return void 0;
918
862
  }
919
- function asString2(value) {
863
+ function asString(value) {
920
864
  return typeof value === "string" ? value : void 0;
921
865
  }
922
866
  function asStringArray(value, description) {
@@ -992,6 +936,68 @@ function isValidFieldAggregationType(value) {
992
936
  // src/evaluation/loaders/message-processor.ts
993
937
  import { readFile as readFile3 } from "node:fs/promises";
994
938
  import path4 from "node:path";
939
+
940
+ // src/evaluation/formatting/segment-formatter.ts
941
+ function formatFileContents(parts) {
942
+ const fileCount = parts.filter((p) => p.isFile).length;
943
+ if (fileCount > 0) {
944
+ return parts.map((part) => {
945
+ if (part.isFile && part.displayPath) {
946
+ return `<file path="${part.displayPath}">
947
+ ${part.content}
948
+ </file>`;
949
+ }
950
+ return part.content;
951
+ }).join("\n\n");
952
+ }
953
+ return parts.map((p) => p.content).join(" ");
954
+ }
955
+ function formatSegment(segment, mode = "lm") {
956
+ const type = asString2(segment.type);
957
+ if (type === "text") {
958
+ return asString2(segment.value);
959
+ }
960
+ if (type === "guideline_ref") {
961
+ const refPath = asString2(segment.path);
962
+ return refPath ? `<Attached: ${refPath}>` : void 0;
963
+ }
964
+ if (type === "file") {
965
+ const filePath = asString2(segment.path);
966
+ if (!filePath) {
967
+ return void 0;
968
+ }
969
+ if (mode === "agent") {
970
+ return `<file: path="${filePath}">`;
971
+ }
972
+ const text = asString2(segment.text);
973
+ if (text && filePath) {
974
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
975
+ }
976
+ }
977
+ return void 0;
978
+ }
979
+ function hasVisibleContent(segments) {
980
+ return segments.some((segment) => {
981
+ const type = asString2(segment.type);
982
+ if (type === "text") {
983
+ const value = asString2(segment.value);
984
+ return value !== void 0 && value.trim().length > 0;
985
+ }
986
+ if (type === "guideline_ref") {
987
+ return false;
988
+ }
989
+ if (type === "file") {
990
+ const text = asString2(segment.text);
991
+ return text !== void 0 && text.trim().length > 0;
992
+ }
993
+ return false;
994
+ });
995
+ }
996
+ function asString2(value) {
997
+ return typeof value === "string" ? value : void 0;
998
+ }
999
+
1000
+ // src/evaluation/loaders/message-processor.ts
995
1001
  var ANSI_YELLOW4 = "\x1B[33m";
996
1002
  var ANSI_RESET4 = "\x1B[0m";
997
1003
  async function processMessages(options) {
@@ -1297,9 +1303,6 @@ ${messageContent}`);
1297
1303
  questionParts.push(formattedContent);
1298
1304
  }
1299
1305
  }
1300
- if (testCase.code_snippets.length > 0) {
1301
- questionParts.push(testCase.code_snippets.join("\n"));
1302
- }
1303
1306
  question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
1304
1307
  }
1305
1308
  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
@@ -1498,7 +1501,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1498
1501
  repoRootPath,
1499
1502
  verbose
1500
1503
  }) : [];
1501
- const codeSnippets = extractCodeBlocks(inputSegments);
1502
1504
  let referenceAnswer = "";
1503
1505
  if (outputSegments.length > 0) {
1504
1506
  const lastMessage = outputSegments[outputSegments.length - 1];
@@ -1571,7 +1573,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1571
1573
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
1572
1574
  guideline_patterns: guidelinePatterns,
1573
1575
  file_paths: allFilePaths,
1574
- code_snippets: codeSnippets,
1575
1576
  expected_outcome: outcome,
1576
1577
  evaluator: evalCaseEvaluatorKind,
1577
1578
  evaluators
@@ -5311,9 +5312,64 @@ function resolveAndCreateProvider(definition, env = process.env) {
5311
5312
  return createProvider(resolved);
5312
5313
  }
5313
5314
 
5314
- // src/evaluation/evaluators.ts
5315
- import { generateText as generateText2 } from "ai";
5316
- import { z as z2 } from "zod";
5315
+ // src/evaluation/evaluators/scoring.ts
5316
+ function scoreToVerdict(score) {
5317
+ if (score >= 0.8) {
5318
+ return "pass";
5319
+ }
5320
+ if (score >= 0.6) {
5321
+ return "borderline";
5322
+ }
5323
+ return "fail";
5324
+ }
5325
+ function clampScore(value) {
5326
+ if (Number.isNaN(value) || !Number.isFinite(value)) {
5327
+ return 0;
5328
+ }
5329
+ if (value < 0) {
5330
+ return 0;
5331
+ }
5332
+ if (value > 1) {
5333
+ return 1;
5334
+ }
5335
+ return value;
5336
+ }
5337
+ function extractJsonBlob(text) {
5338
+ const match = text.match(/\{[\s\S]*\}/);
5339
+ return match?.[0];
5340
+ }
5341
+ function parseJsonFromText(text) {
5342
+ const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
5343
+ const blob = extractJsonBlob(cleaned) ?? cleaned;
5344
+ return JSON.parse(blob);
5345
+ }
5346
+ function isNonEmptyString(value) {
5347
+ return typeof value === "string" && value.trim().length > 0;
5348
+ }
5349
+ function parseJsonSafe(payload) {
5350
+ try {
5351
+ return JSON.parse(payload);
5352
+ } catch {
5353
+ return void 0;
5354
+ }
5355
+ }
5356
+ function deepEqual(a, b) {
5357
+ if (a === b) return true;
5358
+ if (a === null || b === null) return a === b;
5359
+ if (typeof a !== typeof b) return false;
5360
+ if (typeof a !== "object") return a === b;
5361
+ if (Array.isArray(a) !== Array.isArray(b)) return false;
5362
+ if (Array.isArray(a) && Array.isArray(b)) {
5363
+ if (a.length !== b.length) return false;
5364
+ return a.every((val, i) => deepEqual(val, b[i]));
5365
+ }
5366
+ const aObj = a;
5367
+ const bObj = b;
5368
+ const aKeys = Object.keys(aObj);
5369
+ const bKeys = Object.keys(bObj);
5370
+ if (aKeys.length !== bKeys.length) return false;
5371
+ return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
5372
+ }
5317
5373
 
5318
5374
  // src/runtime/exec.ts
5319
5375
  function shellEscapePath(value) {
@@ -5338,7 +5394,9 @@ async function execFileWithStdinBun(argv, stdinPayload, options) {
5338
5394
  cwd: options.cwd,
5339
5395
  stdin: encoder.encode(stdinPayload),
5340
5396
  stdout: "pipe",
5341
- stderr: "pipe"
5397
+ stderr: "pipe",
5398
+ // Merge additional env vars with process.env
5399
+ env: options.env ? { ...process.env, ...options.env } : process.env
5342
5400
  });
5343
5401
  let timedOut = false;
5344
5402
  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
@@ -5373,7 +5431,9 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
5373
5431
  const [cmd, ...args] = argv;
5374
5432
  const child = spawn4(cmd, args, {
5375
5433
  cwd: options.cwd,
5376
- stdio: ["pipe", "pipe", "pipe"]
5434
+ stdio: ["pipe", "pipe", "pipe"],
5435
+ // Merge additional env vars with process.env
5436
+ env: options.env ? { ...process.env, ...options.env } : process.env
5377
5437
  });
5378
5438
  const stdoutChunks = [];
5379
5439
  const stderrChunks = [];
@@ -5426,7 +5486,9 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
5426
5486
  const child = spawn4(wrappedCommand, {
5427
5487
  shell: true,
5428
5488
  cwd: options.cwd,
5429
- stdio: ["ignore", "ignore", "ignore"]
5489
+ stdio: ["ignore", "ignore", "ignore"],
5490
+ // Merge additional env vars with process.env
5491
+ env: options.env ? { ...process.env, ...options.env } : process.env
5430
5492
  });
5431
5493
  const timeout = options.timeoutMs ? setTimeout(() => {
5432
5494
  child.kill();
@@ -5453,32 +5515,387 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
5453
5515
  }
5454
5516
  }
5455
5517
 
5456
- // src/evaluation/case-conversion.ts
5457
- function toSnakeCase(str) {
5458
- if (/^[A-Z]/.test(str)) {
5459
- return str;
5518
+ // src/runtime/target-proxy.ts
5519
+ import { randomBytes } from "node:crypto";
5520
+ import { createServer } from "node:http";
5521
+ var DEFAULT_MAX_CALLS = 50;
5522
+ async function createTargetProxy(options) {
5523
+ const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
5524
+ const token = randomBytes(32).toString("hex");
5525
+ let callCount = 0;
5526
+ let isShutdown = false;
5527
+ const targetsList = availableTargets ?? [defaultProvider.targetName];
5528
+ function resolveProvider(targetName) {
5529
+ if (targetName === void 0 || targetName === defaultProvider.targetName) {
5530
+ return defaultProvider;
5531
+ }
5532
+ if (targetResolver) {
5533
+ return targetResolver(targetName);
5534
+ }
5535
+ return void 0;
5460
5536
  }
5461
- return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
5462
- }
5463
- function toSnakeCaseDeep(obj) {
5464
- if (obj === null || obj === void 0) {
5465
- return obj;
5537
+ const server = createServer(async (req, res) => {
5538
+ res.setHeader("Access-Control-Allow-Origin", "*");
5539
+ res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
5540
+ res.setHeader("Access-Control-Allow-Headers", "Content-Type, Authorization");
5541
+ if (req.method === "OPTIONS") {
5542
+ res.writeHead(204);
5543
+ res.end();
5544
+ return;
5545
+ }
5546
+ const authHeader = req.headers.authorization;
5547
+ if (!authHeader || authHeader !== `Bearer ${token}`) {
5548
+ sendJson(res, 401, { error: "Unauthorized" });
5549
+ return;
5550
+ }
5551
+ if (isShutdown) {
5552
+ sendJson(res, 503, { error: "Proxy is shutting down" });
5553
+ return;
5554
+ }
5555
+ const url2 = req.url ?? "";
5556
+ if (req.method === "GET" && url2 === "/info") {
5557
+ handleInfo(res);
5558
+ return;
5559
+ }
5560
+ if (req.method === "POST" && url2 === "/invoke") {
5561
+ await handleInvoke(req, res);
5562
+ return;
5563
+ }
5564
+ if (req.method === "POST" && url2 === "/invokeBatch") {
5565
+ await handleInvokeBatch(req, res);
5566
+ return;
5567
+ }
5568
+ sendJson(res, 404, { error: "Not found" });
5569
+ });
5570
+ function handleInfo(res) {
5571
+ const response = {
5572
+ targetName: defaultProvider.targetName,
5573
+ maxCalls,
5574
+ callCount,
5575
+ availableTargets: targetsList
5576
+ };
5577
+ sendJson(res, 200, response);
5466
5578
  }
5467
- if (Array.isArray(obj)) {
5468
- return obj.map((item) => toSnakeCaseDeep(item));
5579
+ async function handleInvoke(req, res) {
5580
+ if (callCount >= maxCalls) {
5581
+ sendJson(res, 429, { error: `Max calls exceeded (limit: ${maxCalls})` });
5582
+ return;
5583
+ }
5584
+ try {
5585
+ const body = await readBody(req);
5586
+ const request = JSON.parse(body);
5587
+ if (!request.question || typeof request.question !== "string") {
5588
+ sendJson(res, 400, { error: "Missing required field: question" });
5589
+ return;
5590
+ }
5591
+ const provider = resolveProvider(request.target);
5592
+ if (!provider) {
5593
+ sendJson(res, 400, {
5594
+ error: `Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
5595
+ });
5596
+ return;
5597
+ }
5598
+ callCount++;
5599
+ const response = await provider.invoke({
5600
+ question: request.question,
5601
+ systemPrompt: request.systemPrompt,
5602
+ evalCaseId: request.evalCaseId ?? "proxy",
5603
+ attempt: request.attempt ?? 1
5604
+ });
5605
+ const outputMessages = response.outputMessages ?? [];
5606
+ const rawText = extractLastAssistantContent2(outputMessages);
5607
+ const result = {
5608
+ outputMessages,
5609
+ rawText
5610
+ };
5611
+ sendJson(res, 200, result);
5612
+ } catch (error) {
5613
+ const message = error instanceof Error ? error.message : String(error);
5614
+ sendJson(res, 500, { error: message });
5615
+ }
5469
5616
  }
5470
- if (typeof obj === "object") {
5471
- const result = {};
5472
- for (const [key, value] of Object.entries(obj)) {
5473
- const snakeKey = toSnakeCase(key);
5474
- result[snakeKey] = toSnakeCaseDeep(value);
5617
+ async function handleInvokeBatch(req, res) {
5618
+ try {
5619
+ const body = await readBody(req);
5620
+ const { requests } = JSON.parse(body);
5621
+ if (!Array.isArray(requests)) {
5622
+ sendJson(res, 400, { error: "Missing required field: requests (array)" });
5623
+ return;
5624
+ }
5625
+ if (callCount + requests.length > maxCalls) {
5626
+ sendJson(res, 429, {
5627
+ error: `Batch would exceed max calls (current: ${callCount}, batch: ${requests.length}, limit: ${maxCalls})`
5628
+ });
5629
+ return;
5630
+ }
5631
+ const responses = [];
5632
+ for (const request of requests) {
5633
+ if (!request.question || typeof request.question !== "string") {
5634
+ responses.push({
5635
+ outputMessages: [],
5636
+ rawText: "Error: Missing required field: question"
5637
+ });
5638
+ continue;
5639
+ }
5640
+ const provider = resolveProvider(request.target);
5641
+ if (!provider) {
5642
+ responses.push({
5643
+ outputMessages: [],
5644
+ rawText: `Error: Unknown target '${request.target}'. Available: ${targetsList.join(", ")}`
5645
+ });
5646
+ continue;
5647
+ }
5648
+ callCount++;
5649
+ try {
5650
+ const response = await provider.invoke({
5651
+ question: request.question,
5652
+ systemPrompt: request.systemPrompt,
5653
+ evalCaseId: request.evalCaseId ?? "proxy",
5654
+ attempt: request.attempt ?? 1
5655
+ });
5656
+ const outputMessages = response.outputMessages ?? [];
5657
+ responses.push({
5658
+ outputMessages,
5659
+ rawText: extractLastAssistantContent2(outputMessages)
5660
+ });
5661
+ } catch (error) {
5662
+ const message = error instanceof Error ? error.message : String(error);
5663
+ responses.push({
5664
+ outputMessages: [],
5665
+ rawText: `Error: ${message}`
5666
+ });
5667
+ }
5668
+ }
5669
+ sendJson(res, 200, { responses });
5670
+ } catch (error) {
5671
+ const message = error instanceof Error ? error.message : String(error);
5672
+ sendJson(res, 500, { error: message });
5475
5673
  }
5476
- return result;
5477
5674
  }
5478
- return obj;
5479
- }
5480
-
5481
- // src/evaluation/evaluators.ts
5675
+ await new Promise((resolve, reject) => {
5676
+ server.once("error", reject);
5677
+ server.listen(0, "127.0.0.1", () => {
5678
+ server.removeListener("error", reject);
5679
+ resolve();
5680
+ });
5681
+ });
5682
+ const address = server.address();
5683
+ const url = `http://127.0.0.1:${address.port}`;
5684
+ return {
5685
+ url,
5686
+ token,
5687
+ shutdown: async () => {
5688
+ isShutdown = true;
5689
+ return new Promise((resolve, reject) => {
5690
+ server.close((err) => {
5691
+ if (err) reject(err);
5692
+ else resolve();
5693
+ });
5694
+ });
5695
+ },
5696
+ getUsageMetadata: () => ({
5697
+ callCount,
5698
+ maxCalls
5699
+ })
5700
+ };
5701
+ }
5702
+ function sendJson(res, statusCode, body) {
5703
+ res.writeHead(statusCode, { "Content-Type": "application/json" });
5704
+ res.end(JSON.stringify(body));
5705
+ }
5706
+ function readBody(req) {
5707
+ return new Promise((resolve, reject) => {
5708
+ const chunks = [];
5709
+ req.on("data", (chunk) => chunks.push(chunk));
5710
+ req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
5711
+ req.on("error", reject);
5712
+ });
5713
+ }
5714
+ function extractLastAssistantContent2(messages) {
5715
+ for (let i = messages.length - 1; i >= 0; i--) {
5716
+ const msg = messages[i];
5717
+ if (msg.role === "assistant" && msg.content !== void 0) {
5718
+ if (typeof msg.content === "string") {
5719
+ return msg.content;
5720
+ }
5721
+ if (Array.isArray(msg.content)) {
5722
+ for (const part of msg.content) {
5723
+ if (typeof part === "object" && part !== null && "text" in part) {
5724
+ return String(part.text);
5725
+ }
5726
+ }
5727
+ }
5728
+ }
5729
+ }
5730
+ return void 0;
5731
+ }
5732
+
5733
+ // src/evaluation/case-conversion.ts
5734
+ function toSnakeCase(str) {
5735
+ if (/^[A-Z]/.test(str)) {
5736
+ return str;
5737
+ }
5738
+ return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
5739
+ }
5740
+ function toSnakeCaseDeep(obj) {
5741
+ if (obj === null || obj === void 0) {
5742
+ return obj;
5743
+ }
5744
+ if (Array.isArray(obj)) {
5745
+ return obj.map((item) => toSnakeCaseDeep(item));
5746
+ }
5747
+ if (typeof obj === "object") {
5748
+ const result = {};
5749
+ for (const [key, value] of Object.entries(obj)) {
5750
+ const snakeKey = toSnakeCase(key);
5751
+ result[snakeKey] = toSnakeCaseDeep(value);
5752
+ }
5753
+ return result;
5754
+ }
5755
+ return obj;
5756
+ }
5757
+
5758
+ // src/evaluation/evaluators/code-evaluator.ts
5759
+ var CodeEvaluator = class {
5760
+ kind = "code";
5761
+ script;
5762
+ cwd;
5763
+ agentTimeoutMs;
5764
+ config;
5765
+ target;
5766
+ constructor(options) {
5767
+ this.script = options.script;
5768
+ this.cwd = options.cwd;
5769
+ this.agentTimeoutMs = options.agentTimeoutMs;
5770
+ this.config = options.config;
5771
+ this.target = options.target;
5772
+ }
5773
+ async evaluate(context) {
5774
+ const payload = {
5775
+ question: context.evalCase.question,
5776
+ expectedOutcome: context.evalCase.expected_outcome,
5777
+ expectedMessages: context.evalCase.expected_messages,
5778
+ referenceAnswer: context.evalCase.reference_answer,
5779
+ candidateAnswer: context.candidate,
5780
+ outputMessages: context.outputMessages ?? null,
5781
+ guidelineFiles: context.evalCase.guideline_paths,
5782
+ inputFiles: context.evalCase.file_paths.filter(
5783
+ (path15) => !context.evalCase.guideline_paths.includes(path15)
5784
+ ),
5785
+ inputMessages: context.evalCase.input_messages,
5786
+ traceSummary: context.traceSummary ?? null,
5787
+ config: this.config ?? null
5788
+ };
5789
+ const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5790
+ let proxyEnv;
5791
+ let proxyShutdown;
5792
+ let getProxyUsage;
5793
+ if (this.target !== void 0 && context.judgeProvider) {
5794
+ const maxCalls = this.target.max_calls ?? DEFAULT_MAX_CALLS;
5795
+ const proxy = await createTargetProxy({
5796
+ defaultProvider: context.judgeProvider,
5797
+ targetResolver: context.targetResolver,
5798
+ availableTargets: context.availableTargets,
5799
+ maxCalls
5800
+ });
5801
+ proxyEnv = {
5802
+ AGENTV_TARGET_PROXY_URL: proxy.url,
5803
+ AGENTV_TARGET_PROXY_TOKEN: proxy.token
5804
+ };
5805
+ proxyShutdown = proxy.shutdown;
5806
+ getProxyUsage = proxy.getUsageMetadata;
5807
+ }
5808
+ try {
5809
+ const stdout = await executeScript(
5810
+ this.script,
5811
+ inputPayload,
5812
+ this.agentTimeoutMs,
5813
+ this.cwd,
5814
+ proxyEnv
5815
+ );
5816
+ const parsed = parseJsonSafe(stdout);
5817
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5818
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
5819
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
5820
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
5821
+ const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
5822
+ const proxyUsage = getProxyUsage?.();
5823
+ const evaluatorRawRequest = {
5824
+ script: this.script,
5825
+ ...this.cwd ? { cwd: this.cwd } : {},
5826
+ ...proxyUsage ? {
5827
+ target_proxy: {
5828
+ call_count: proxyUsage.callCount,
5829
+ max_calls: proxyUsage.maxCalls
5830
+ }
5831
+ } : {}
5832
+ };
5833
+ return {
5834
+ score,
5835
+ verdict: scoreToVerdict(score),
5836
+ hits,
5837
+ misses,
5838
+ expectedAspectCount: hits.length + misses.length || 1,
5839
+ reasoning,
5840
+ evaluatorRawRequest,
5841
+ ...details ? { details } : {}
5842
+ };
5843
+ } catch (error) {
5844
+ const message = error instanceof Error ? error.message : String(error);
5845
+ const proxyUsage = getProxyUsage?.();
5846
+ return {
5847
+ score: 0,
5848
+ verdict: "fail",
5849
+ hits: [],
5850
+ misses: [`Code evaluator failed: ${message}`],
5851
+ expectedAspectCount: 1,
5852
+ reasoning: message,
5853
+ evaluatorRawRequest: {
5854
+ script: this.script,
5855
+ ...this.cwd ? { cwd: this.cwd } : {},
5856
+ ...proxyUsage ? {
5857
+ target_proxy: {
5858
+ call_count: proxyUsage.callCount,
5859
+ max_calls: proxyUsage.maxCalls
5860
+ }
5861
+ } : {},
5862
+ error: message
5863
+ }
5864
+ };
5865
+ } finally {
5866
+ if (proxyShutdown) {
5867
+ await proxyShutdown();
5868
+ }
5869
+ }
5870
+ }
5871
+ };
5872
+ async function executeScript(scriptPath, input, agentTimeoutMs, cwd, env) {
5873
+ const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs, env });
5874
+ if (exitCode !== 0) {
5875
+ const trimmedErr = formatStderr(stderr);
5876
+ throw new Error(
5877
+ trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5878
+ );
5879
+ }
5880
+ return stdout.trim();
5881
+ }
5882
+ function formatStderr(stderr) {
5883
+ const trimmed = stderr.trim();
5884
+ const maxLength = 2e3;
5885
+ if (trimmed.length <= maxLength) {
5886
+ return trimmed;
5887
+ }
5888
+ const tail = trimmed.slice(-maxLength);
5889
+ return `...(truncated, last ${maxLength} chars)
5890
+ ${tail}`;
5891
+ }
5892
+
5893
+ // src/evaluation/evaluators/composite.ts
5894
+ import { generateText as generateText3 } from "ai";
5895
+
5896
+ // src/evaluation/evaluators/llm-judge.ts
5897
+ import { generateText as generateText2 } from "ai";
5898
+ import { z as z2 } from "zod";
5482
5899
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
5483
5900
 
5484
5901
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -5558,7 +5975,7 @@ var LlmJudgeEvaluator = class {
5558
5975
  target: judgeProvider.targetName
5559
5976
  };
5560
5977
  try {
5561
- const { data, providerResponse } = await this.runWithRetry({
5978
+ const { data } = await this.runWithRetry({
5562
5979
  context,
5563
5980
  judgeProvider,
5564
5981
  systemPrompt,
@@ -5707,105 +6124,11 @@ You must return a valid JSON object matching this schema:
5707
6124
  "overall_reasoning": "string (summary)"
5708
6125
  }`;
5709
6126
  }
5710
- function scoreToVerdict(score) {
5711
- if (score >= 0.8) {
5712
- return "pass";
5713
- }
5714
- if (score >= 0.6) {
5715
- return "borderline";
5716
- }
5717
- return "fail";
5718
- }
5719
- function clampScore(value) {
5720
- if (Number.isNaN(value) || !Number.isFinite(value)) {
5721
- return 0;
5722
- }
5723
- if (value < 0) {
5724
- return 0;
5725
- }
5726
- if (value > 1) {
5727
- return 1;
5728
- }
5729
- return value;
5730
- }
5731
- function extractJsonBlob(text) {
5732
- const match = text.match(/\{[\s\S]*\}/);
5733
- return match?.[0];
5734
- }
5735
- function parseJsonFromText(text) {
5736
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
5737
- const blob = extractJsonBlob(cleaned) ?? cleaned;
5738
- return JSON.parse(blob);
5739
- }
5740
- function isNonEmptyString(value) {
5741
- return typeof value === "string" && value.trim().length > 0;
6127
+ function substituteVariables(template, variables) {
6128
+ return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
6129
+ return variables[varName] ?? match;
6130
+ });
5742
6131
  }
5743
- var CodeEvaluator = class {
5744
- kind = "code";
5745
- script;
5746
- cwd;
5747
- agentTimeoutMs;
5748
- config;
5749
- constructor(options) {
5750
- this.script = options.script;
5751
- this.cwd = options.cwd;
5752
- this.agentTimeoutMs = options.agentTimeoutMs;
5753
- this.config = options.config;
5754
- }
5755
- async evaluate(context) {
5756
- const payload = {
5757
- question: context.evalCase.question,
5758
- expectedOutcome: context.evalCase.expected_outcome,
5759
- expectedMessages: context.evalCase.expected_messages,
5760
- referenceAnswer: context.evalCase.reference_answer,
5761
- candidateAnswer: context.candidate,
5762
- outputMessages: context.outputMessages ?? null,
5763
- guidelineFiles: context.evalCase.guideline_paths,
5764
- inputFiles: context.evalCase.file_paths.filter(
5765
- (path15) => !context.evalCase.guideline_paths.includes(path15)
5766
- ),
5767
- inputMessages: context.evalCase.input_messages,
5768
- traceSummary: context.traceSummary ?? null,
5769
- config: this.config ?? null
5770
- };
5771
- const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
5772
- try {
5773
- const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
5774
- const parsed = parseJsonSafe(stdout);
5775
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
5776
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
5777
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
5778
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
5779
- return {
5780
- score,
5781
- verdict: scoreToVerdict(score),
5782
- hits,
5783
- misses,
5784
- expectedAspectCount: hits.length + misses.length || 1,
5785
- reasoning,
5786
- evaluatorRawRequest: {
5787
- script: this.script,
5788
- ...this.cwd ? { cwd: this.cwd } : {}
5789
- }
5790
- };
5791
- } catch (error) {
5792
- const message = error instanceof Error ? error.message : String(error);
5793
- return {
5794
- score: 0,
5795
- verdict: "fail",
5796
- hits: [],
5797
- misses: [`Code evaluator failed: ${message}`],
5798
- expectedAspectCount: 1,
5799
- reasoning: message,
5800
- evaluatorRawRequest: {
5801
- script: this.script,
5802
- ...this.cwd ? { cwd: this.cwd } : {},
5803
- error: message
5804
- }
5805
- };
5806
- }
5807
- }
5808
- };
5809
6132
  function calculateRubricScore(result, rubrics) {
5810
6133
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
5811
6134
  const hits = [];
@@ -5833,273 +6156,281 @@ function calculateRubricScore(result, rubrics) {
5833
6156
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
5834
6157
  return { score, verdict, hits, misses };
5835
6158
  }
5836
- async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
5837
- const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
5838
- if (exitCode !== 0) {
5839
- const trimmedErr = formatStderr(stderr);
5840
- throw new Error(
5841
- trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
5842
- );
5843
- }
5844
- return stdout.trim();
5845
- }
5846
- function formatStderr(stderr) {
5847
- const trimmed = stderr.trim();
5848
- const maxLength = 2e3;
5849
- if (trimmed.length <= maxLength) {
5850
- return trimmed;
5851
- }
5852
- const tail = trimmed.slice(-maxLength);
5853
- return `...(truncated, last ${maxLength} chars)
5854
- ${tail}`;
5855
- }
5856
- function parseJsonSafe(payload) {
5857
- try {
5858
- return JSON.parse(payload);
5859
- } catch {
5860
- return void 0;
5861
- }
5862
- }
5863
- function substituteVariables(template, variables) {
5864
- return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
5865
- return variables[varName] ?? match;
5866
- });
5867
- }
5868
- function deepEqual(a, b) {
5869
- if (a === b) return true;
5870
- if (a === null || b === null) return a === b;
5871
- if (typeof a !== typeof b) return false;
5872
- if (typeof a !== "object") return a === b;
5873
- if (Array.isArray(a) !== Array.isArray(b)) return false;
5874
- if (Array.isArray(a) && Array.isArray(b)) {
5875
- if (a.length !== b.length) return false;
5876
- return a.every((val, i) => deepEqual(val, b[i]));
5877
- }
5878
- const aObj = a;
5879
- const bObj = b;
5880
- const aKeys = Object.keys(aObj);
5881
- const bKeys = Object.keys(bObj);
5882
- if (aKeys.length !== bKeys.length) return false;
5883
- return aKeys.every((key) => Object.hasOwn(bObj, key) && deepEqual(aObj[key], bObj[key]));
5884
- }
5885
- function argsMatch(expected, actual) {
5886
- if (expected === void 0) return true;
5887
- if (expected === "any") return true;
5888
- if (actual === void 0) return false;
5889
- for (const key of Object.keys(expected)) {
5890
- if (!Object.hasOwn(actual, key)) return false;
5891
- if (!deepEqual(expected[key], actual[key])) return false;
5892
- }
5893
- return true;
5894
- }
5895
- var ToolTrajectoryEvaluator = class {
5896
- kind = "tool_trajectory";
6159
+
6160
+ // src/evaluation/evaluators/composite.ts
6161
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
6162
+ {{EVALUATOR_RESULTS_JSON}}
6163
+
6164
+ Decide the final score and verdict based on all evaluator results.
6165
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
6166
+ var CompositeEvaluator = class {
6167
+ kind = "composite";
5897
6168
  config;
6169
+ evaluatorFactory;
6170
+ cwd;
5898
6171
  constructor(options) {
5899
6172
  this.config = options.config;
6173
+ this.evaluatorFactory = options.evaluatorFactory;
6174
+ this.cwd = options.cwd;
5900
6175
  }
5901
- evaluate(context) {
5902
- const { outputMessages, traceSummary } = context;
5903
- const toolCalls = this.extractToolCallsFromMessages(outputMessages);
5904
- if (toolCalls.length === 0 && !traceSummary) {
5905
- return {
5906
- score: 0,
5907
- verdict: "fail",
5908
- hits: [],
5909
- misses: ["No trace available for evaluation"],
5910
- expectedAspectCount: 1
5911
- };
5912
- }
5913
- const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
5914
- if (!summary) {
5915
- return {
5916
- score: 0,
5917
- verdict: "fail",
5918
- hits: [],
5919
- misses: ["No trace available for evaluation"],
5920
- expectedAspectCount: 1
5921
- };
5922
- }
5923
- switch (this.config.mode) {
5924
- case "any_order":
5925
- return this.evaluateAnyOrder(summary);
5926
- case "in_order":
5927
- return this.evaluateInOrder(toolCalls);
5928
- case "exact":
5929
- return this.evaluateExact(toolCalls);
5930
- default:
6176
+ async evaluate(context) {
6177
+ const memberResults = await Promise.all(
6178
+ this.config.evaluators.map(async (memberConfig) => {
6179
+ const evaluator = this.evaluatorFactory.create(memberConfig, context);
5931
6180
  return {
5932
- score: 0,
5933
- verdict: "fail",
5934
- hits: [],
5935
- misses: [`Unknown mode: ${this.config.mode}`],
5936
- expectedAspectCount: 1
6181
+ id: memberConfig.name,
6182
+ type: memberConfig.type,
6183
+ result: await evaluator.evaluate(context)
5937
6184
  };
5938
- }
5939
- }
5940
- /**
5941
- * Extract tool calls from output messages.
5942
- */
5943
- extractToolCallsFromMessages(messages) {
5944
- if (!messages) {
5945
- return [];
5946
- }
5947
- const toolCalls = [];
5948
- for (const message of messages) {
5949
- if (message.toolCalls) {
5950
- for (const call of message.toolCalls) {
5951
- toolCalls.push({
5952
- name: call.tool,
5953
- args: call.input
5954
- });
5955
- }
5956
- }
5957
- }
5958
- return toolCalls;
6185
+ })
6186
+ );
6187
+ return this.aggregate(memberResults, context);
5959
6188
  }
5960
- /**
5961
- * Build a summary from extracted tool calls.
5962
- */
5963
- buildSummary(toolCalls) {
5964
- const toolCallsByName = {};
5965
- for (const call of toolCalls) {
5966
- toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
6189
+ async aggregate(results, context) {
6190
+ const aggregator = this.config.aggregator;
6191
+ switch (aggregator.type) {
6192
+ case "code_judge":
6193
+ return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
6194
+ case "llm_judge":
6195
+ return this.runLlmAggregator(results, context, aggregator);
6196
+ default:
6197
+ return this.runWeightedAverage(results, aggregator.weights);
5967
6198
  }
5968
- const toolNames = Object.keys(toolCallsByName).sort();
5969
- return {
5970
- eventCount: toolCalls.length,
5971
- toolNames,
5972
- toolCallsByName,
5973
- errorCount: 0
5974
- };
5975
6199
  }
5976
- evaluateAnyOrder(summary) {
5977
- const minimums = this.config.minimums ?? {};
5978
- const toolNames = Object.keys(minimums);
5979
- if (toolNames.length === 0) {
5980
- return {
5981
- score: 1,
5982
- verdict: "pass",
5983
- hits: ["No tool requirements specified"],
5984
- misses: [],
5985
- expectedAspectCount: 0
5986
- };
5987
- }
5988
- const hits = [];
5989
- const misses = [];
5990
- for (const toolName of toolNames) {
5991
- const required = minimums[toolName];
5992
- const actual = summary.toolCallsByName[toolName] ?? 0;
5993
- if (actual >= required) {
5994
- hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
5995
- } else {
5996
- misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
6200
+ runWeightedAverage(results, weights) {
6201
+ let totalWeight = 0;
6202
+ let weightedSum = 0;
6203
+ const allHits = [];
6204
+ const allMisses = [];
6205
+ const reasoningParts = [];
6206
+ const evaluatorResults = [];
6207
+ for (const member of results) {
6208
+ const weight = weights?.[member.id] ?? 1;
6209
+ totalWeight += weight;
6210
+ weightedSum += member.result.score * weight;
6211
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
6212
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
6213
+ if (member.result.reasoning) {
6214
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
5997
6215
  }
6216
+ evaluatorResults.push({
6217
+ name: member.id,
6218
+ type: member.type,
6219
+ score: member.result.score,
6220
+ weight,
6221
+ verdict: member.result.verdict,
6222
+ hits: [...member.result.hits],
6223
+ misses: [...member.result.misses],
6224
+ reasoning: member.result.reasoning,
6225
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6226
+ evaluatorResults: member.result.evaluatorResults,
6227
+ details: member.result.details
6228
+ });
5998
6229
  }
5999
- const score = hits.length / toolNames.length;
6230
+ const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
6000
6231
  return {
6001
- score,
6002
- verdict: scoreToVerdict(score),
6003
- hits,
6004
- misses,
6005
- expectedAspectCount: toolNames.length
6232
+ score: clampScore(finalScore),
6233
+ verdict: scoreToVerdict(finalScore),
6234
+ hits: allHits,
6235
+ misses: allMisses,
6236
+ expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
6237
+ reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
6238
+ evaluatorRawRequest: {
6239
+ aggregator: "weighted_average",
6240
+ ...weights ? { weights } : {}
6241
+ },
6242
+ evaluatorResults
6006
6243
  };
6007
6244
  }
6008
- evaluateInOrder(toolCalls) {
6009
- const expected = this.config.expected ?? [];
6010
- if (expected.length === 0) {
6245
+ async runCodeAggregator(results, scriptPath, cwd, weights) {
6246
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6247
+ const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
6248
+ const evaluatorResults = results.map((member) => ({
6249
+ name: member.id,
6250
+ type: member.type,
6251
+ score: member.result.score,
6252
+ weight: weights?.[member.id] ?? 1,
6253
+ verdict: member.result.verdict,
6254
+ hits: [...member.result.hits],
6255
+ misses: [...member.result.misses],
6256
+ reasoning: member.result.reasoning,
6257
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6258
+ evaluatorResults: member.result.evaluatorResults,
6259
+ details: member.result.details
6260
+ }));
6261
+ try {
6262
+ const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6263
+ const parsed = parseJsonSafe(stdout);
6264
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6265
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6266
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6267
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6268
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
6011
6269
  return {
6012
- score: 1,
6013
- verdict: "pass",
6014
- hits: ["No tool sequence specified"],
6015
- misses: [],
6016
- expectedAspectCount: 0
6270
+ score,
6271
+ verdict,
6272
+ hits,
6273
+ misses,
6274
+ expectedAspectCount: hits.length + misses.length || 1,
6275
+ reasoning,
6276
+ evaluatorRawRequest: {
6277
+ aggregator: "code_judge",
6278
+ script: scriptPath
6279
+ },
6280
+ evaluatorResults
6281
+ };
6282
+ } catch (error) {
6283
+ const message = error instanceof Error ? error.message : String(error);
6284
+ return {
6285
+ score: 0,
6286
+ verdict: "fail",
6287
+ hits: [],
6288
+ misses: [`Code aggregator failed: ${message}`],
6289
+ expectedAspectCount: 1,
6290
+ reasoning: message,
6291
+ evaluatorRawRequest: {
6292
+ aggregator: "code_judge",
6293
+ script: scriptPath,
6294
+ error: message
6295
+ },
6296
+ evaluatorResults
6017
6297
  };
6018
6298
  }
6019
- const hits = [];
6020
- const misses = [];
6021
- let actualIndex = 0;
6022
- for (let i = 0; i < expected.length; i++) {
6023
- const expectedItem = expected[i];
6024
- const expectedTool = expectedItem.tool;
6025
- let found = false;
6026
- let argsMismatch = false;
6027
- while (actualIndex < toolCalls.length) {
6028
- const actualCall = toolCalls[actualIndex];
6029
- if (actualCall.name === expectedTool) {
6030
- if (argsMatch(expectedItem.args, actualCall.args)) {
6031
- hits.push(`Found ${expectedTool} at position ${actualIndex}`);
6032
- actualIndex++;
6033
- found = true;
6034
- break;
6035
- }
6036
- misses.push(
6037
- `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
6038
- );
6039
- actualIndex++;
6040
- argsMismatch = true;
6041
- break;
6042
- }
6043
- actualIndex++;
6044
- }
6045
- if (!found && !argsMismatch) {
6046
- misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
6047
- }
6048
- }
6049
- const score = hits.length / expected.length;
6050
- return {
6051
- score,
6052
- verdict: scoreToVerdict(score),
6053
- hits,
6054
- misses,
6055
- expectedAspectCount: expected.length
6056
- };
6057
6299
  }
6058
- evaluateExact(toolCalls) {
6059
- const expected = this.config.expected ?? [];
6060
- if (expected.length === 0) {
6300
+ async runLlmAggregator(results, context, config) {
6301
+ const judgeProvider = context.judgeProvider;
6302
+ if (!judgeProvider) {
6303
+ throw new Error("No judge provider available for LLM aggregation");
6304
+ }
6305
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6306
+ const resultsJson = JSON.stringify(resultsObject, null, 2);
6307
+ const evaluatorResults = results.map((member) => ({
6308
+ name: member.id,
6309
+ type: member.type,
6310
+ score: member.result.score,
6311
+ verdict: member.result.verdict,
6312
+ hits: [...member.result.hits],
6313
+ misses: [...member.result.misses],
6314
+ reasoning: member.result.reasoning,
6315
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
6316
+ evaluatorResults: member.result.evaluatorResults,
6317
+ details: member.result.details
6318
+ }));
6319
+ const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
6320
+ const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
6321
+ const systemPrompt = buildOutputSchema();
6322
+ const evaluatorRawRequest = {
6323
+ aggregator: "llm_judge",
6324
+ userPrompt,
6325
+ systemPrompt,
6326
+ target: judgeProvider.targetName
6327
+ };
6328
+ try {
6329
+ const model = judgeProvider.asLanguageModel?.();
6330
+ if (model) {
6331
+ const { text } = await generateText3({
6332
+ model,
6333
+ system: systemPrompt,
6334
+ prompt: userPrompt
6335
+ });
6336
+ const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6337
+ const score2 = clampScore(data2.score);
6338
+ const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
6339
+ const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
6340
+ const reasoning2 = data2.reasoning;
6341
+ return {
6342
+ score: score2,
6343
+ verdict: scoreToVerdict(score2),
6344
+ hits: hits2,
6345
+ misses: misses2,
6346
+ expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
6347
+ reasoning: reasoning2,
6348
+ evaluatorRawRequest,
6349
+ evaluatorResults
6350
+ };
6351
+ }
6352
+ const response = await judgeProvider.invoke({
6353
+ question: userPrompt,
6354
+ systemPrompt,
6355
+ evalCaseId: context.evalCase.id,
6356
+ attempt: context.attempt
6357
+ });
6358
+ const data = freeformEvaluationSchema.parse(
6359
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
6360
+ );
6361
+ const score = clampScore(data.score);
6362
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
6363
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
6364
+ const reasoning = data.reasoning;
6061
6365
  return {
6062
- score: 1,
6063
- verdict: "pass",
6064
- hits: ["No tool sequence specified"],
6366
+ score,
6367
+ verdict: scoreToVerdict(score),
6368
+ hits,
6369
+ misses,
6370
+ expectedAspectCount: Math.max(hits.length + misses.length, 1),
6371
+ reasoning,
6372
+ evaluatorRawRequest,
6373
+ evaluatorResults
6374
+ };
6375
+ } catch {
6376
+ return {
6377
+ score: 0,
6378
+ verdict: "fail",
6379
+ hits: [],
6065
6380
  misses: [],
6066
- expectedAspectCount: 0
6381
+ expectedAspectCount: 1,
6382
+ evaluatorRawRequest,
6383
+ evaluatorResults
6067
6384
  };
6068
6385
  }
6069
- const hits = [];
6070
- const misses = [];
6071
- if (toolCalls.length !== expected.length) {
6072
- misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
6073
- }
6074
- const checkLength = Math.min(expected.length, toolCalls.length);
6075
- for (let i = 0; i < checkLength; i++) {
6076
- const expectedItem = expected[i];
6077
- const expectedTool = expectedItem.tool;
6078
- const actualCall = toolCalls[i];
6079
- const actualTool = actualCall.name;
6080
- if (actualTool === expectedTool) {
6081
- if (argsMatch(expectedItem.args, actualCall.args)) {
6082
- hits.push(`Position ${i}: ${expectedTool}`);
6083
- } else {
6084
- misses.push(`Position ${i}: ${expectedTool} args mismatch`);
6386
+ }
6387
+ };
6388
+
6389
+ // src/evaluation/evaluators/cost.ts
6390
+ var CostEvaluator = class {
6391
+ kind = "cost";
6392
+ config;
6393
+ constructor(options) {
6394
+ this.config = options.config;
6395
+ }
6396
+ evaluate(context) {
6397
+ const { budget } = this.config;
6398
+ const costUsd = context.traceSummary?.costUsd;
6399
+ if (costUsd === void 0) {
6400
+ return {
6401
+ score: 0,
6402
+ verdict: "fail",
6403
+ hits: [],
6404
+ misses: ["No cost data available in trace"],
6405
+ expectedAspectCount: 1,
6406
+ reasoning: "Execution cost not reported by provider",
6407
+ evaluatorRawRequest: {
6408
+ type: "cost",
6409
+ budget,
6410
+ costUsd: null
6085
6411
  }
6086
- } else {
6087
- misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
6088
- }
6089
- }
6090
- for (let i = checkLength; i < expected.length; i++) {
6091
- misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
6412
+ };
6092
6413
  }
6093
- const score = hits.length / expected.length;
6414
+ const passed = costUsd <= budget;
6415
+ const score = passed ? 1 : 0;
6416
+ const formatCost = (n) => `$${n.toFixed(4)}`;
6094
6417
  return {
6095
6418
  score,
6096
- verdict: scoreToVerdict(score),
6097
- hits,
6098
- misses,
6099
- expectedAspectCount: expected.length
6419
+ verdict: passed ? "pass" : "fail",
6420
+ hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
6421
+ misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
6422
+ expectedAspectCount: 1,
6423
+ reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
6424
+ evaluatorRawRequest: {
6425
+ type: "cost",
6426
+ budget,
6427
+ costUsd
6428
+ }
6100
6429
  };
6101
6430
  }
6102
6431
  };
6432
+
6433
+ // src/evaluation/evaluators/field-accuracy.ts
6103
6434
  var DEFAULT_DATE_FORMATS = [
6104
6435
  "YYYY-MM-DDTHH:mm:ssZ",
6105
6436
  // ISO with timezone
@@ -6312,434 +6643,209 @@ var FieldAccuracyEvaluator = class {
6312
6643
  }
6313
6644
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
6314
6645
  return {
6315
- path: path15,
6316
- score: 0,
6317
- weight,
6318
- hit: false,
6319
- message: `${path15} (invalid numeric value)`
6320
- };
6321
- }
6322
- const diff = Math.abs(candidateNum - expectedNum);
6323
- let withinTolerance;
6324
- if (relative) {
6325
- const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
6326
- withinTolerance = relativeDiff <= tolerance;
6327
- } else {
6328
- withinTolerance = diff <= tolerance;
6329
- }
6330
- if (withinTolerance) {
6331
- return {
6332
- path: path15,
6333
- score: 1,
6334
- weight,
6335
- hit: true,
6336
- message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
6337
- };
6338
- }
6339
- return {
6340
- path: path15,
6341
- score: 0,
6342
- weight,
6343
- hit: false,
6344
- message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6345
- };
6346
- }
6347
- /**
6348
- * Date comparison with format normalization.
6349
- */
6350
- compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
6351
- const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6352
- const candidateDate = parseDate(String(candidateValue), formats);
6353
- const expectedDate = parseDate(String(expectedValue), formats);
6354
- if (candidateDate === null) {
6355
- return {
6356
- path: path15,
6357
- score: 0,
6358
- weight,
6359
- hit: false,
6360
- message: `${path15} (unparseable candidate date)`
6361
- };
6362
- }
6363
- if (expectedDate === null) {
6364
- return {
6365
- path: path15,
6366
- score: 0,
6367
- weight,
6368
- hit: false,
6369
- message: `${path15} (unparseable expected date)`
6370
- };
6371
- }
6372
- if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6373
- return {
6374
- path: path15,
6375
- score: 1,
6376
- weight,
6377
- hit: true,
6378
- message: path15
6379
- };
6380
- }
6381
- return {
6382
- path: path15,
6383
- score: 0,
6384
- weight,
6385
- hit: false,
6386
- message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6387
- };
6388
- }
6389
- /**
6390
- * Aggregate field results using configured strategy.
6391
- */
6392
- aggregateResults(results) {
6393
- const aggregation = this.config.aggregation ?? "weighted_average";
6394
- const hits = [];
6395
- const misses = [];
6396
- for (const result of results) {
6397
- if (result.hit) {
6398
- hits.push(result.message);
6399
- } else {
6400
- misses.push(result.message);
6401
- }
6402
- }
6403
- let score;
6404
- if (aggregation === "all_or_nothing") {
6405
- score = misses.length === 0 ? 1 : 0;
6406
- } else {
6407
- const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
6408
- if (totalWeight === 0) {
6409
- score = results.length === 0 ? 1 : 0;
6410
- } else {
6411
- const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
6412
- score = weightedSum / totalWeight;
6413
- }
6414
- }
6415
- const reasoning = `${hits.length}/${results.length} fields matched`;
6416
- return {
6417
- score: clampScore(score),
6418
- verdict: scoreToVerdict(score),
6419
- hits: hits.slice(0, 4),
6420
- misses: misses.slice(0, 4),
6421
- expectedAspectCount: results.length,
6422
- reasoning
6423
- };
6424
- }
6425
- };
6426
- function resolvePath(obj, path15) {
6427
- if (!path15 || !obj) {
6428
- return void 0;
6429
- }
6430
- const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
6431
- let current = obj;
6432
- for (const part of parts) {
6433
- if (current === null || current === void 0) {
6434
- return void 0;
6435
- }
6436
- if (typeof current !== "object") {
6437
- return void 0;
6438
- }
6439
- const isIndex = /^\d+$/.test(part);
6440
- if (isIndex && Array.isArray(current)) {
6441
- current = current[Number.parseInt(part, 10)];
6442
- } else {
6443
- current = current[part];
6444
- }
6445
- }
6446
- return current;
6447
- }
6448
- function toNumber(value) {
6449
- if (typeof value === "number") {
6450
- return value;
6451
- }
6452
- if (typeof value === "string") {
6453
- const num = Number.parseFloat(value);
6454
- return Number.isNaN(num) ? null : num;
6455
- }
6456
- return null;
6457
- }
6458
- function parseDate(dateStr, formats) {
6459
- if (!dateStr) return null;
6460
- const trimmed = dateStr.trim();
6461
- const isoDate = new Date(trimmed);
6462
- if (!Number.isNaN(isoDate.getTime())) {
6463
- return isoDate;
6464
- }
6465
- const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
6466
- if (localizedMatch) {
6467
- const day = Number.parseInt(localizedMatch[1], 10);
6468
- const monthName = localizedMatch[2].toLowerCase();
6469
- const year = Number.parseInt(localizedMatch[3], 10);
6470
- const month = MONTH_NAMES[monthName];
6471
- if (month !== void 0) {
6472
- return new Date(year, month, day);
6473
- }
6474
- }
6475
- const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
6476
- if (usMatch) {
6477
- const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
6478
- const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
6479
- if (hasUSFormat && !hasEUFormat) {
6480
- const month = Number.parseInt(usMatch[1], 10) - 1;
6481
- const day = Number.parseInt(usMatch[2], 10);
6482
- const year = Number.parseInt(usMatch[3], 10);
6483
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6484
- return new Date(year, month, day);
6485
- }
6486
- } else if (hasEUFormat && !hasUSFormat) {
6487
- const day = Number.parseInt(usMatch[1], 10);
6488
- const month = Number.parseInt(usMatch[2], 10) - 1;
6489
- const year = Number.parseInt(usMatch[3], 10);
6490
- if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6491
- return new Date(year, month, day);
6492
- }
6493
- } else {
6494
- const num1 = Number.parseInt(usMatch[1], 10);
6495
- const num2 = Number.parseInt(usMatch[2], 10);
6496
- const year = Number.parseInt(usMatch[3], 10);
6497
- if (num1 > 12 && num2 <= 12) {
6498
- return new Date(year, num2 - 1, num1);
6499
- }
6500
- if (num2 > 12 && num1 <= 12) {
6501
- return new Date(year, num1 - 1, num2);
6502
- }
6503
- if (num1 <= 12 && num2 <= 31) {
6504
- return new Date(year, num1 - 1, num2);
6505
- }
6506
- }
6507
- }
6508
- return null;
6509
- }
6510
- function formatDateISO(date) {
6511
- return date.toISOString().split("T")[0];
6512
- }
6513
- function parseJsonFromTextSafe(text) {
6514
- const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
6515
- const match = cleaned.match(/\{[\s\S]*\}/);
6516
- const blob = match?.[0] ?? cleaned;
6517
- return JSON.parse(blob);
6518
- }
6519
- var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
6520
- {{EVALUATOR_RESULTS_JSON}}
6521
-
6522
- Decide the final score and verdict based on all evaluator results.
6523
- Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
6524
- var CompositeEvaluator = class {
6525
- kind = "composite";
6526
- config;
6527
- evaluatorFactory;
6528
- cwd;
6529
- constructor(options) {
6530
- this.config = options.config;
6531
- this.evaluatorFactory = options.evaluatorFactory;
6532
- this.cwd = options.cwd;
6533
- }
6534
- async evaluate(context) {
6535
- const memberResults = await Promise.all(
6536
- this.config.evaluators.map(async (memberConfig) => {
6537
- const evaluator = this.evaluatorFactory.create(memberConfig, context);
6538
- return {
6539
- id: memberConfig.name,
6540
- type: memberConfig.type,
6541
- result: await evaluator.evaluate(context)
6542
- };
6543
- })
6544
- );
6545
- return this.aggregate(memberResults, context);
6546
- }
6547
- async aggregate(results, context) {
6548
- const aggregator = this.config.aggregator;
6549
- switch (aggregator.type) {
6550
- case "code_judge":
6551
- return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
6552
- case "llm_judge":
6553
- return this.runLlmAggregator(results, context, aggregator);
6554
- default:
6555
- return this.runWeightedAverage(results, aggregator.weights);
6556
- }
6557
- }
6558
- runWeightedAverage(results, weights) {
6559
- let totalWeight = 0;
6560
- let weightedSum = 0;
6561
- const allHits = [];
6562
- const allMisses = [];
6563
- const reasoningParts = [];
6564
- const evaluatorResults = [];
6565
- for (const member of results) {
6566
- const weight = weights?.[member.id] ?? 1;
6567
- totalWeight += weight;
6568
- weightedSum += member.result.score * weight;
6569
- allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
6570
- allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
6571
- if (member.result.reasoning) {
6572
- reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
6573
- }
6574
- evaluatorResults.push({
6575
- name: member.id,
6576
- type: member.type,
6577
- score: member.result.score,
6578
- weight,
6579
- verdict: member.result.verdict,
6580
- hits: [...member.result.hits],
6581
- misses: [...member.result.misses],
6582
- reasoning: member.result.reasoning,
6583
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6584
- evaluatorResults: member.result.evaluatorResults
6585
- });
6586
- }
6587
- const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
6588
- return {
6589
- score: clampScore(finalScore),
6590
- verdict: scoreToVerdict(finalScore),
6591
- hits: allHits,
6592
- misses: allMisses,
6593
- expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
6594
- reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
6595
- evaluatorRawRequest: {
6596
- aggregator: "weighted_average",
6597
- ...weights ? { weights } : {}
6598
- },
6599
- evaluatorResults
6600
- };
6601
- }
6602
- async runCodeAggregator(results, scriptPath, cwd, weights) {
6603
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6604
- const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
6605
- const evaluatorResults = results.map((member) => ({
6606
- name: member.id,
6607
- type: member.type,
6608
- score: member.result.score,
6609
- weight: weights?.[member.id] ?? 1,
6610
- verdict: member.result.verdict,
6611
- hits: [...member.result.hits],
6612
- misses: [...member.result.misses],
6613
- reasoning: member.result.reasoning,
6614
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6615
- evaluatorResults: member.result.evaluatorResults
6616
- }));
6617
- try {
6618
- const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
6619
- const parsed = parseJsonSafe(stdout);
6620
- const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
6621
- const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
6622
- const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
6623
- const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
6624
- const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
6625
- return {
6626
- score,
6627
- verdict,
6628
- hits,
6629
- misses,
6630
- expectedAspectCount: hits.length + misses.length || 1,
6631
- reasoning,
6632
- evaluatorRawRequest: {
6633
- aggregator: "code_judge",
6634
- script: scriptPath
6635
- },
6636
- evaluatorResults
6637
- };
6638
- } catch (error) {
6639
- const message = error instanceof Error ? error.message : String(error);
6640
- return {
6641
- score: 0,
6642
- verdict: "fail",
6643
- hits: [],
6644
- misses: [`Code aggregator failed: ${message}`],
6645
- expectedAspectCount: 1,
6646
- reasoning: message,
6647
- evaluatorRawRequest: {
6648
- aggregator: "code_judge",
6649
- script: scriptPath,
6650
- error: message
6651
- },
6652
- evaluatorResults
6653
- };
6654
- }
6655
- }
6656
- async runLlmAggregator(results, context, config) {
6657
- const judgeProvider = context.judgeProvider;
6658
- if (!judgeProvider) {
6659
- throw new Error("No judge provider available for LLM aggregation");
6660
- }
6661
- const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
6662
- const resultsJson = JSON.stringify(resultsObject, null, 2);
6663
- const evaluatorResults = results.map((member) => ({
6664
- name: member.id,
6665
- type: member.type,
6666
- score: member.result.score,
6667
- verdict: member.result.verdict,
6668
- hits: [...member.result.hits],
6669
- misses: [...member.result.misses],
6670
- reasoning: member.result.reasoning,
6671
- evaluatorRawRequest: member.result.evaluatorRawRequest,
6672
- evaluatorResults: member.result.evaluatorResults
6673
- }));
6674
- const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
6675
- const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
6676
- const systemPrompt = buildOutputSchema();
6677
- const evaluatorRawRequest = {
6678
- aggregator: "llm_judge",
6679
- userPrompt,
6680
- systemPrompt,
6681
- target: judgeProvider.targetName
6682
- };
6683
- try {
6684
- const model = judgeProvider.asLanguageModel?.();
6685
- if (model) {
6686
- const { text } = await generateText2({
6687
- model,
6688
- system: systemPrompt,
6689
- prompt: userPrompt
6690
- });
6691
- const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
6692
- const score2 = clampScore(data2.score);
6693
- const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
6694
- const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
6695
- const reasoning2 = data2.reasoning;
6696
- return {
6697
- score: score2,
6698
- verdict: scoreToVerdict(score2),
6699
- hits: hits2,
6700
- misses: misses2,
6701
- expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
6702
- reasoning: reasoning2,
6703
- evaluatorRawRequest,
6704
- evaluatorResults
6705
- };
6706
- }
6707
- const response = await judgeProvider.invoke({
6708
- question: userPrompt,
6709
- systemPrompt,
6710
- evalCaseId: context.evalCase.id,
6711
- attempt: context.attempt
6712
- });
6713
- const data = freeformEvaluationSchema.parse(
6714
- parseJsonFromText(extractLastAssistantContent(response.outputMessages))
6715
- );
6716
- const score = clampScore(data.score);
6717
- const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
6718
- const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
6719
- const reasoning = data.reasoning;
6720
- return {
6721
- score,
6722
- verdict: scoreToVerdict(score),
6723
- hits,
6724
- misses,
6725
- expectedAspectCount: Math.max(hits.length + misses.length, 1),
6726
- reasoning,
6727
- evaluatorRawRequest,
6728
- evaluatorResults
6646
+ path: path15,
6647
+ score: 0,
6648
+ weight,
6649
+ hit: false,
6650
+ message: `${path15} (invalid numeric value)`
6729
6651
  };
6730
- } catch {
6652
+ }
6653
+ const diff = Math.abs(candidateNum - expectedNum);
6654
+ let withinTolerance;
6655
+ if (relative) {
6656
+ const relativeDiff = expectedNum === 0 ? diff : diff / Math.abs(expectedNum);
6657
+ withinTolerance = relativeDiff <= tolerance;
6658
+ } else {
6659
+ withinTolerance = diff <= tolerance;
6660
+ }
6661
+ if (withinTolerance) {
6662
+ return {
6663
+ path: path15,
6664
+ score: 1,
6665
+ weight,
6666
+ hit: true,
6667
+ message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
6668
+ };
6669
+ }
6670
+ return {
6671
+ path: path15,
6672
+ score: 0,
6673
+ weight,
6674
+ hit: false,
6675
+ message: `${path15} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
6676
+ };
6677
+ }
6678
+ /**
6679
+ * Date comparison with format normalization.
6680
+ */
6681
+ compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
6682
+ const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
6683
+ const candidateDate = parseDate(String(candidateValue), formats);
6684
+ const expectedDate = parseDate(String(expectedValue), formats);
6685
+ if (candidateDate === null) {
6731
6686
  return {
6687
+ path: path15,
6732
6688
  score: 0,
6733
- verdict: "fail",
6734
- hits: [],
6735
- misses: [],
6736
- expectedAspectCount: 1,
6737
- evaluatorRawRequest,
6738
- evaluatorResults
6689
+ weight,
6690
+ hit: false,
6691
+ message: `${path15} (unparseable candidate date)`
6692
+ };
6693
+ }
6694
+ if (expectedDate === null) {
6695
+ return {
6696
+ path: path15,
6697
+ score: 0,
6698
+ weight,
6699
+ hit: false,
6700
+ message: `${path15} (unparseable expected date)`
6701
+ };
6702
+ }
6703
+ if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
6704
+ return {
6705
+ path: path15,
6706
+ score: 1,
6707
+ weight,
6708
+ hit: true,
6709
+ message: path15
6739
6710
  };
6740
6711
  }
6712
+ return {
6713
+ path: path15,
6714
+ score: 0,
6715
+ weight,
6716
+ hit: false,
6717
+ message: `${path15} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
6718
+ };
6719
+ }
6720
+ /**
6721
+ * Aggregate field results using configured strategy.
6722
+ */
6723
+ aggregateResults(results) {
6724
+ const aggregation = this.config.aggregation ?? "weighted_average";
6725
+ const hits = [];
6726
+ const misses = [];
6727
+ for (const result of results) {
6728
+ if (result.hit) {
6729
+ hits.push(result.message);
6730
+ } else {
6731
+ misses.push(result.message);
6732
+ }
6733
+ }
6734
+ let score;
6735
+ if (aggregation === "all_or_nothing") {
6736
+ score = misses.length === 0 ? 1 : 0;
6737
+ } else {
6738
+ const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
6739
+ if (totalWeight === 0) {
6740
+ score = results.length === 0 ? 1 : 0;
6741
+ } else {
6742
+ const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
6743
+ score = weightedSum / totalWeight;
6744
+ }
6745
+ }
6746
+ const reasoning = `${hits.length}/${results.length} fields matched`;
6747
+ return {
6748
+ score: clampScore(score),
6749
+ verdict: scoreToVerdict(score),
6750
+ hits: hits.slice(0, 4),
6751
+ misses: misses.slice(0, 4),
6752
+ expectedAspectCount: results.length,
6753
+ reasoning
6754
+ };
6741
6755
  }
6742
6756
  };
6757
+ function resolvePath(obj, path15) {
6758
+ if (!path15 || !obj) {
6759
+ return void 0;
6760
+ }
6761
+ const parts = path15.split(/\.|\[|\]/).filter((p) => p.length > 0);
6762
+ let current = obj;
6763
+ for (const part of parts) {
6764
+ if (current === null || current === void 0) {
6765
+ return void 0;
6766
+ }
6767
+ if (typeof current !== "object") {
6768
+ return void 0;
6769
+ }
6770
+ const isIndex = /^\d+$/.test(part);
6771
+ if (isIndex && Array.isArray(current)) {
6772
+ current = current[Number.parseInt(part, 10)];
6773
+ } else {
6774
+ current = current[part];
6775
+ }
6776
+ }
6777
+ return current;
6778
+ }
6779
+ function toNumber(value) {
6780
+ if (typeof value === "number") {
6781
+ return value;
6782
+ }
6783
+ if (typeof value === "string") {
6784
+ const num = Number.parseFloat(value);
6785
+ return Number.isNaN(num) ? null : num;
6786
+ }
6787
+ return null;
6788
+ }
6789
+ function parseDate(dateStr, formats) {
6790
+ if (!dateStr) return null;
6791
+ const trimmed = dateStr.trim();
6792
+ const isoDate = new Date(trimmed);
6793
+ if (!Number.isNaN(isoDate.getTime())) {
6794
+ return isoDate;
6795
+ }
6796
+ const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
6797
+ if (localizedMatch) {
6798
+ const day = Number.parseInt(localizedMatch[1], 10);
6799
+ const monthName = localizedMatch[2].toLowerCase();
6800
+ const year = Number.parseInt(localizedMatch[3], 10);
6801
+ const month = MONTH_NAMES[monthName];
6802
+ if (month !== void 0) {
6803
+ return new Date(year, month, day);
6804
+ }
6805
+ }
6806
+ const usMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
6807
+ if (usMatch) {
6808
+ const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
6809
+ const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
6810
+ if (hasUSFormat && !hasEUFormat) {
6811
+ const month = Number.parseInt(usMatch[1], 10) - 1;
6812
+ const day = Number.parseInt(usMatch[2], 10);
6813
+ const year = Number.parseInt(usMatch[3], 10);
6814
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6815
+ return new Date(year, month, day);
6816
+ }
6817
+ } else if (hasEUFormat && !hasUSFormat) {
6818
+ const day = Number.parseInt(usMatch[1], 10);
6819
+ const month = Number.parseInt(usMatch[2], 10) - 1;
6820
+ const year = Number.parseInt(usMatch[3], 10);
6821
+ if (month >= 0 && month <= 11 && day >= 1 && day <= 31) {
6822
+ return new Date(year, month, day);
6823
+ }
6824
+ } else {
6825
+ const num1 = Number.parseInt(usMatch[1], 10);
6826
+ const num2 = Number.parseInt(usMatch[2], 10);
6827
+ const year = Number.parseInt(usMatch[3], 10);
6828
+ if (num1 > 12 && num2 <= 12) {
6829
+ return new Date(year, num2 - 1, num1);
6830
+ }
6831
+ if (num2 > 12 && num1 <= 12) {
6832
+ return new Date(year, num1 - 1, num2);
6833
+ }
6834
+ if (num1 <= 12 && num2 <= 31) {
6835
+ return new Date(year, num1 - 1, num2);
6836
+ }
6837
+ }
6838
+ }
6839
+ return null;
6840
+ }
6841
+ function formatDateISO(date) {
6842
+ return date.toISOString().split("T")[0];
6843
+ }
6844
+ function parseJsonFromTextSafe(text) {
6845
+ return parseJsonFromText(text);
6846
+ }
6847
+
6848
+ // src/evaluation/evaluators/latency.ts
6743
6849
  var LatencyEvaluator = class {
6744
6850
  kind = "latency";
6745
6851
  config;
@@ -6772,57 +6878,17 @@ var LatencyEvaluator = class {
6772
6878
  hits: passed ? [`Duration ${durationMs}ms <= ${threshold}ms threshold`] : [],
6773
6879
  misses: passed ? [] : [`Duration ${durationMs}ms > ${threshold}ms threshold`],
6774
6880
  expectedAspectCount: 1,
6775
- reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
6776
- evaluatorRawRequest: {
6777
- type: "latency",
6778
- threshold,
6779
- durationMs
6780
- }
6781
- };
6782
- }
6783
- };
6784
- var CostEvaluator = class {
6785
- kind = "cost";
6786
- config;
6787
- constructor(options) {
6788
- this.config = options.config;
6789
- }
6790
- evaluate(context) {
6791
- const { budget } = this.config;
6792
- const costUsd = context.traceSummary?.costUsd;
6793
- if (costUsd === void 0) {
6794
- return {
6795
- score: 0,
6796
- verdict: "fail",
6797
- hits: [],
6798
- misses: ["No cost data available in trace"],
6799
- expectedAspectCount: 1,
6800
- reasoning: "Execution cost not reported by provider",
6801
- evaluatorRawRequest: {
6802
- type: "cost",
6803
- budget,
6804
- costUsd: null
6805
- }
6806
- };
6807
- }
6808
- const passed = costUsd <= budget;
6809
- const score = passed ? 1 : 0;
6810
- const formatCost = (n) => `$${n.toFixed(4)}`;
6811
- return {
6812
- score,
6813
- verdict: passed ? "pass" : "fail",
6814
- hits: passed ? [`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`] : [],
6815
- misses: passed ? [] : [`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`],
6816
- expectedAspectCount: 1,
6817
- reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
6818
- evaluatorRawRequest: {
6819
- type: "cost",
6820
- budget,
6821
- costUsd
6881
+ reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
6882
+ evaluatorRawRequest: {
6883
+ type: "latency",
6884
+ threshold,
6885
+ durationMs
6822
6886
  }
6823
6887
  };
6824
6888
  }
6825
6889
  };
6890
+
6891
+ // src/evaluation/evaluators/token-usage.ts
6826
6892
  var TokenUsageEvaluator = class {
6827
6893
  kind = "token_usage";
6828
6894
  config;
@@ -6906,6 +6972,226 @@ var TokenUsageEvaluator = class {
6906
6972
  }
6907
6973
  };
6908
6974
 
6975
+ // src/evaluation/evaluators/tool-trajectory.ts
6976
+ function argsMatch(expected, actual) {
6977
+ if (expected === void 0) return true;
6978
+ if (expected === "any") return true;
6979
+ if (actual === void 0) return false;
6980
+ for (const key of Object.keys(expected)) {
6981
+ if (!Object.hasOwn(actual, key)) return false;
6982
+ if (!deepEqual(expected[key], actual[key])) return false;
6983
+ }
6984
+ return true;
6985
+ }
6986
+ var ToolTrajectoryEvaluator = class {
6987
+ kind = "tool_trajectory";
6988
+ config;
6989
+ constructor(options) {
6990
+ this.config = options.config;
6991
+ }
6992
+ evaluate(context) {
6993
+ const { outputMessages, traceSummary } = context;
6994
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
6995
+ if (toolCalls.length === 0 && !traceSummary) {
6996
+ return {
6997
+ score: 0,
6998
+ verdict: "fail",
6999
+ hits: [],
7000
+ misses: ["No trace available for evaluation"],
7001
+ expectedAspectCount: 1
7002
+ };
7003
+ }
7004
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
7005
+ if (!summary) {
7006
+ return {
7007
+ score: 0,
7008
+ verdict: "fail",
7009
+ hits: [],
7010
+ misses: ["No trace available for evaluation"],
7011
+ expectedAspectCount: 1
7012
+ };
7013
+ }
7014
+ switch (this.config.mode) {
7015
+ case "any_order":
7016
+ return this.evaluateAnyOrder(summary);
7017
+ case "in_order":
7018
+ return this.evaluateInOrder(toolCalls);
7019
+ case "exact":
7020
+ return this.evaluateExact(toolCalls);
7021
+ default:
7022
+ return {
7023
+ score: 0,
7024
+ verdict: "fail",
7025
+ hits: [],
7026
+ misses: [`Unknown mode: ${this.config.mode}`],
7027
+ expectedAspectCount: 1
7028
+ };
7029
+ }
7030
+ }
7031
+ /**
7032
+ * Extract tool calls from output messages.
7033
+ */
7034
+ extractToolCallsFromMessages(messages) {
7035
+ if (!messages) {
7036
+ return [];
7037
+ }
7038
+ const toolCalls = [];
7039
+ for (const message of messages) {
7040
+ if (message.toolCalls) {
7041
+ for (const call of message.toolCalls) {
7042
+ toolCalls.push({
7043
+ name: call.tool,
7044
+ args: call.input
7045
+ });
7046
+ }
7047
+ }
7048
+ }
7049
+ return toolCalls;
7050
+ }
7051
+ /**
7052
+ * Build a summary from extracted tool calls.
7053
+ */
7054
+ buildSummary(toolCalls) {
7055
+ const toolCallsByName = {};
7056
+ for (const call of toolCalls) {
7057
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
7058
+ }
7059
+ const toolNames = Object.keys(toolCallsByName).sort();
7060
+ return {
7061
+ eventCount: toolCalls.length,
7062
+ toolNames,
7063
+ toolCallsByName,
7064
+ errorCount: 0
7065
+ };
7066
+ }
7067
+ evaluateAnyOrder(summary) {
7068
+ const minimums = this.config.minimums ?? {};
7069
+ const toolNames = Object.keys(minimums);
7070
+ if (toolNames.length === 0) {
7071
+ return {
7072
+ score: 1,
7073
+ verdict: "pass",
7074
+ hits: ["No tool requirements specified"],
7075
+ misses: [],
7076
+ expectedAspectCount: 0
7077
+ };
7078
+ }
7079
+ const hits = [];
7080
+ const misses = [];
7081
+ for (const toolName of toolNames) {
7082
+ const required = minimums[toolName];
7083
+ const actual = summary.toolCallsByName[toolName] ?? 0;
7084
+ if (actual >= required) {
7085
+ hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7086
+ } else {
7087
+ misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
7088
+ }
7089
+ }
7090
+ const score = hits.length / toolNames.length;
7091
+ return {
7092
+ score,
7093
+ verdict: scoreToVerdict(score),
7094
+ hits,
7095
+ misses,
7096
+ expectedAspectCount: toolNames.length
7097
+ };
7098
+ }
7099
+ evaluateInOrder(toolCalls) {
7100
+ const expected = this.config.expected ?? [];
7101
+ if (expected.length === 0) {
7102
+ return {
7103
+ score: 1,
7104
+ verdict: "pass",
7105
+ hits: ["No tool sequence specified"],
7106
+ misses: [],
7107
+ expectedAspectCount: 0
7108
+ };
7109
+ }
7110
+ const hits = [];
7111
+ const misses = [];
7112
+ let actualIndex = 0;
7113
+ for (let i = 0; i < expected.length; i++) {
7114
+ const expectedItem = expected[i];
7115
+ const expectedTool = expectedItem.tool;
7116
+ let found = false;
7117
+ let argsMismatch = false;
7118
+ while (actualIndex < toolCalls.length) {
7119
+ const actualCall = toolCalls[actualIndex];
7120
+ if (actualCall.name === expectedTool) {
7121
+ if (argsMatch(expectedItem.args, actualCall.args)) {
7122
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
7123
+ actualIndex++;
7124
+ found = true;
7125
+ break;
7126
+ }
7127
+ misses.push(
7128
+ `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
7129
+ );
7130
+ actualIndex++;
7131
+ argsMismatch = true;
7132
+ break;
7133
+ }
7134
+ actualIndex++;
7135
+ }
7136
+ if (!found && !argsMismatch) {
7137
+ misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
7138
+ }
7139
+ }
7140
+ const score = hits.length / expected.length;
7141
+ return {
7142
+ score,
7143
+ verdict: scoreToVerdict(score),
7144
+ hits,
7145
+ misses,
7146
+ expectedAspectCount: expected.length
7147
+ };
7148
+ }
7149
+ evaluateExact(toolCalls) {
7150
+ const expected = this.config.expected ?? [];
7151
+ if (expected.length === 0) {
7152
+ return {
7153
+ score: 1,
7154
+ verdict: "pass",
7155
+ hits: ["No tool sequence specified"],
7156
+ misses: [],
7157
+ expectedAspectCount: 0
7158
+ };
7159
+ }
7160
+ const hits = [];
7161
+ const misses = [];
7162
+ if (toolCalls.length !== expected.length) {
7163
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
7164
+ }
7165
+ const checkLength = Math.min(expected.length, toolCalls.length);
7166
+ for (let i = 0; i < checkLength; i++) {
7167
+ const expectedItem = expected[i];
7168
+ const expectedTool = expectedItem.tool;
7169
+ const actualCall = toolCalls[i];
7170
+ const actualTool = actualCall.name;
7171
+ if (actualTool === expectedTool) {
7172
+ if (argsMatch(expectedItem.args, actualCall.args)) {
7173
+ hits.push(`Position ${i}: ${expectedTool}`);
7174
+ } else {
7175
+ misses.push(`Position ${i}: ${expectedTool} args mismatch`);
7176
+ }
7177
+ } else {
7178
+ misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
7179
+ }
7180
+ }
7181
+ for (let i = checkLength; i < expected.length; i++) {
7182
+ misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
7183
+ }
7184
+ const score = hits.length / expected.length;
7185
+ return {
7186
+ score,
7187
+ verdict: scoreToVerdict(score),
7188
+ hits,
7189
+ misses,
7190
+ expectedAspectCount: expected.length
7191
+ };
7192
+ }
7193
+ };
7194
+
6909
7195
  // src/evaluation/orchestrator.ts
6910
7196
  import { createHash } from "node:crypto";
6911
7197
  import path14 from "node:path";
@@ -7119,6 +7405,17 @@ async function runEvaluation(options) {
7119
7405
  }
7120
7406
  return getOrCreateProvider(resolvedJudge);
7121
7407
  };
7408
+ const targetResolver = (name) => {
7409
+ const resolved = resolveTargetByName(name);
7410
+ if (!resolved) {
7411
+ return void 0;
7412
+ }
7413
+ return getOrCreateProvider(resolved);
7414
+ };
7415
+ const availableTargets = [
7416
+ target.name,
7417
+ ...Array.from(targetDefinitions.keys())
7418
+ ];
7122
7419
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
7123
7420
  const primaryProvider = getOrCreateProvider(target);
7124
7421
  const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
@@ -7148,7 +7445,9 @@ async function runEvaluation(options) {
7148
7445
  onResult,
7149
7446
  verbose,
7150
7447
  resolveJudgeProvider,
7151
- agentTimeoutMs
7448
+ agentTimeoutMs,
7449
+ targetResolver,
7450
+ availableTargets
7152
7451
  });
7153
7452
  } catch (error) {
7154
7453
  if (verbose) {
@@ -7187,7 +7486,9 @@ async function runEvaluation(options) {
7187
7486
  cache,
7188
7487
  useCache,
7189
7488
  now,
7190
- judgeProvider
7489
+ judgeProvider,
7490
+ targetResolver,
7491
+ availableTargets
7191
7492
  });
7192
7493
  if (onProgress) {
7193
7494
  await onProgress({
@@ -7254,7 +7555,9 @@ async function runBatchEvaluation(options) {
7254
7555
  onProgress,
7255
7556
  onResult,
7256
7557
  resolveJudgeProvider,
7257
- agentTimeoutMs
7558
+ agentTimeoutMs,
7559
+ targetResolver,
7560
+ availableTargets
7258
7561
  } = options;
7259
7562
  const promptInputsList = [];
7260
7563
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
@@ -7329,7 +7632,9 @@ async function runBatchEvaluation(options) {
7329
7632
  judgeProvider: await resolveJudgeProvider(target),
7330
7633
  agentTimeoutMs,
7331
7634
  outputMessages,
7332
- traceSummary
7635
+ traceSummary,
7636
+ targetResolver,
7637
+ availableTargets
7333
7638
  });
7334
7639
  if (providerError) {
7335
7640
  result = { ...result, error: providerError };
@@ -7387,7 +7692,9 @@ async function runEvalCase(options) {
7387
7692
  cache,
7388
7693
  useCache,
7389
7694
  signal,
7390
- judgeProvider
7695
+ judgeProvider,
7696
+ targetResolver,
7697
+ availableTargets
7391
7698
  } = options;
7392
7699
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
7393
7700
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -7461,7 +7768,9 @@ async function runEvalCase(options) {
7461
7768
  judgeProvider,
7462
7769
  agentTimeoutMs,
7463
7770
  outputMessages,
7464
- traceSummary
7771
+ traceSummary,
7772
+ targetResolver,
7773
+ availableTargets
7465
7774
  });
7466
7775
  return providerError ? { ...result, error: providerError } : result;
7467
7776
  } catch (error) {
@@ -7481,7 +7790,9 @@ async function evaluateCandidate(options) {
7481
7790
  judgeProvider,
7482
7791
  agentTimeoutMs,
7483
7792
  outputMessages,
7484
- traceSummary
7793
+ traceSummary,
7794
+ targetResolver,
7795
+ availableTargets
7485
7796
  } = options;
7486
7797
  const gradeTimestamp = nowFn();
7487
7798
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -7496,7 +7807,9 @@ async function evaluateCandidate(options) {
7496
7807
  judgeProvider,
7497
7808
  agentTimeoutMs,
7498
7809
  outputMessages,
7499
- traceSummary
7810
+ traceSummary,
7811
+ targetResolver,
7812
+ availableTargets
7500
7813
  });
7501
7814
  const completedAt = nowFn();
7502
7815
  let agentProviderRequest;
@@ -7549,7 +7862,9 @@ async function runEvaluatorsForCase(options) {
7549
7862
  judgeProvider,
7550
7863
  agentTimeoutMs,
7551
7864
  outputMessages,
7552
- traceSummary
7865
+ traceSummary,
7866
+ targetResolver,
7867
+ availableTargets
7553
7868
  } = options;
7554
7869
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
7555
7870
  return runEvaluatorList({
@@ -7565,7 +7880,9 @@ async function runEvaluatorsForCase(options) {
7565
7880
  judgeProvider,
7566
7881
  agentTimeoutMs,
7567
7882
  outputMessages,
7568
- traceSummary
7883
+ traceSummary,
7884
+ targetResolver,
7885
+ availableTargets
7569
7886
  });
7570
7887
  }
7571
7888
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -7583,7 +7900,9 @@ async function runEvaluatorsForCase(options) {
7583
7900
  now,
7584
7901
  judgeProvider,
7585
7902
  outputMessages,
7586
- traceSummary
7903
+ traceSummary,
7904
+ targetResolver,
7905
+ availableTargets
7587
7906
  });
7588
7907
  return { score };
7589
7908
  }
@@ -7601,7 +7920,9 @@ async function runEvaluatorList(options) {
7601
7920
  judgeProvider,
7602
7921
  agentTimeoutMs,
7603
7922
  outputMessages,
7604
- traceSummary
7923
+ traceSummary,
7924
+ targetResolver,
7925
+ availableTargets
7605
7926
  } = options;
7606
7927
  const scored = [];
7607
7928
  const evaluatorResults = [];
@@ -7639,7 +7960,8 @@ async function runEvaluatorList(options) {
7639
7960
  script: evaluator.script,
7640
7961
  cwd: evaluator.resolvedCwd ?? evaluator.cwd,
7641
7962
  agentTimeoutMs,
7642
- config: evaluator.config
7963
+ config: evaluator.config,
7964
+ target: evaluator.target
7643
7965
  });
7644
7966
  const score2 = await codeEvaluator.evaluate({
7645
7967
  evalCase,
@@ -7649,8 +7971,11 @@ async function runEvaluatorList(options) {
7649
7971
  attempt,
7650
7972
  promptInputs,
7651
7973
  now,
7974
+ judgeProvider,
7652
7975
  outputMessages,
7653
- traceSummary
7976
+ traceSummary,
7977
+ targetResolver,
7978
+ availableTargets
7654
7979
  });
7655
7980
  const weight = evaluator.weight ?? 1;
7656
7981
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -7663,7 +7988,8 @@ async function runEvaluatorList(options) {
7663
7988
  hits: score2.hits,
7664
7989
  misses: score2.misses,
7665
7990
  reasoning: score2.reasoning,
7666
- evaluatorProviderRequest: score2.evaluatorRawRequest
7991
+ evaluatorProviderRequest: score2.evaluatorRawRequest,
7992
+ details: score2.details
7667
7993
  });
7668
7994
  }
7669
7995
  if (evaluator.type === "composite") {
@@ -7677,7 +8003,8 @@ async function runEvaluatorList(options) {
7677
8003
  script: memberConfig.script,
7678
8004
  cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
7679
8005
  agentTimeoutMs,
7680
- config: memberConfig.config
8006
+ config: memberConfig.config,
8007
+ target: memberConfig.target
7681
8008
  });
7682
8009
  case "composite":
7683
8010
  return new CompositeEvaluator({
@@ -7726,7 +8053,9 @@ async function runEvaluatorList(options) {
7726
8053
  now,
7727
8054
  judgeProvider,
7728
8055
  outputMessages,
7729
- traceSummary
8056
+ traceSummary,
8057
+ targetResolver,
8058
+ availableTargets
7730
8059
  });
7731
8060
  const weight = evaluator.weight ?? 1;
7732
8061
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -7922,11 +8251,11 @@ async function runEvaluatorList(options) {
7922
8251
  (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
7923
8252
  0
7924
8253
  );
7925
- const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
8254
+ const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
7926
8255
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
7927
8256
  const score = {
7928
8257
  score: aggregateScore,
7929
- verdict: scoreToVerdict2(aggregateScore),
8258
+ verdict: scoreToVerdict(aggregateScore),
7930
8259
  hits,
7931
8260
  misses,
7932
8261
  expectedAspectCount,
@@ -7973,18 +8302,6 @@ async function resolveCustomPrompt(config) {
7973
8302
  }
7974
8303
  return config.prompt;
7975
8304
  }
7976
- function isNonEmptyString2(value) {
7977
- return typeof value === "string" && value.trim().length > 0;
7978
- }
7979
- function scoreToVerdict2(score) {
7980
- if (score >= 0.8) {
7981
- return "pass";
7982
- }
7983
- if (score >= 0.6) {
7984
- return "borderline";
7985
- }
7986
- return "fail";
7987
- }
7988
8305
  function filterEvalCases(evalCases, evalId) {
7989
8306
  if (!evalId) {
7990
8307
  return evalCases;
@@ -8127,7 +8444,8 @@ function mapChildResults(children) {
8127
8444
  misses: child.misses,
8128
8445
  reasoning: child.reasoning,
8129
8446
  evaluatorProviderRequest: child.evaluatorRawRequest,
8130
- evaluatorResults: mapChildResults(child.evaluatorResults)
8447
+ evaluatorResults: mapChildResults(child.evaluatorResults),
8448
+ details: child.details
8131
8449
  }));
8132
8450
  }
8133
8451
  function computeWeightedMean(entries) {
@@ -8142,7 +8460,7 @@ function computeWeightedMean(entries) {
8142
8460
  }
8143
8461
 
8144
8462
  // src/evaluation/generators/rubric-generator.ts
8145
- import { generateText as generateText3 } from "ai";
8463
+ import { generateText as generateText4 } from "ai";
8146
8464
  import { z as z3 } from "zod";
8147
8465
  var rubricItemSchema = z3.object({
8148
8466
  id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
@@ -8176,7 +8494,7 @@ You must return a valid JSON object matching this schema:
8176
8494
  let lastError;
8177
8495
  for (let attempt = 1; attempt <= 3; attempt++) {
8178
8496
  try {
8179
- const { text } = await generateText3({
8497
+ const { text } = await generateText4({
8180
8498
  model,
8181
8499
  system,
8182
8500
  prompt
@@ -8238,31 +8556,39 @@ export {
8238
8556
  ToolTrajectoryEvaluator,
8239
8557
  avgToolDurationMs,
8240
8558
  buildDirectoryChain,
8559
+ buildOutputSchema,
8241
8560
  buildPromptInputs,
8242
8561
  buildSearchRoots,
8562
+ clampScore,
8243
8563
  computeTraceSummary,
8244
8564
  consumeClaudeCodeLogEntries,
8245
8565
  consumeCodexLogEntries,
8246
8566
  consumePiLogEntries,
8247
8567
  createAgentKernel,
8248
8568
  createProvider,
8569
+ deepEqual,
8249
8570
  ensureVSCodeSubagents,
8571
+ executeScript,
8250
8572
  explorationRatio,
8251
- extractCodeBlocks,
8573
+ extractJsonBlob,
8252
8574
  fileExists,
8253
8575
  findGitRoot,
8576
+ freeformEvaluationSchema,
8254
8577
  generateRubrics,
8255
8578
  getHitCount,
8256
8579
  isEvaluatorKind,
8257
8580
  isGuidelineFile,
8258
8581
  isJsonObject,
8259
8582
  isJsonValue,
8583
+ isNonEmptyString,
8260
8584
  isTestMessage,
8261
8585
  isTestMessageRole,
8262
8586
  listTargetNames,
8263
8587
  loadEvalCases,
8264
8588
  mergeExecutionMetrics,
8265
8589
  normalizeLineEndings,
8590
+ parseJsonFromText,
8591
+ parseJsonSafe,
8266
8592
  readJsonFile,
8267
8593
  readTargetDefinitions,
8268
8594
  readTestSuiteMetadata,
@@ -8272,6 +8598,7 @@ export {
8272
8598
  resolveTargetDefinition,
8273
8599
  runEvalCase,
8274
8600
  runEvaluation,
8601
+ scoreToVerdict,
8275
8602
  subscribeToClaudeCodeLogEntries,
8276
8603
  subscribeToCodexLogEntries,
8277
8604
  subscribeToPiLogEntries,