@agentv/core 0.22.2 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,10 +5,11 @@ import {
5
5
  findGitRoot,
6
6
  isAgentProvider,
7
7
  normalizeLineEndings,
8
+ readJsonFile,
8
9
  readTextFile,
9
10
  resolveFileReference,
10
11
  resolveTargetDefinition
11
- } from "./chunk-B2J23S7D.js";
12
+ } from "./chunk-OYTL3LNN.js";
12
13
 
13
14
  // src/evaluation/types.ts
14
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -51,7 +52,14 @@ function isTestMessage(value) {
51
52
  }
52
53
  return candidate.content.every(isJsonObject);
53
54
  }
54
- var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
55
+ var EVALUATOR_KIND_VALUES = [
56
+ "code_judge",
57
+ "llm_judge",
58
+ "rubric",
59
+ "composite",
60
+ "tool_trajectory",
61
+ "expected_messages"
62
+ ];
55
63
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
56
64
  function isEvaluatorKind(value) {
57
65
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -60,6 +68,44 @@ function getHitCount(result) {
60
68
  return result.hits.length;
61
69
  }
62
70
 
71
+ // src/evaluation/trace.ts
72
/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True only when `value` is one of the strings
 *   "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  // `switch` compares with strict equality, so non-string values never match.
  switch (value) {
    case "model_step":
    case "tool_call":
    case "tool_result":
    case "message":
    case "error":
      return true;
    default:
      return false;
  }
}
75
/**
 * Type guard for a single trace event.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object whose `type`
 *   passes `isTraceEventType` and whose `timestamp` is a string.
 */
function isTraceEvent(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  // The type is checked first; the timestamp is only inspected for valid types.
  if (!isTraceEventType(value.type)) {
    return false;
  }
  return typeof value.timestamp === "string";
}
82
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object with a string `tool` property.
 */
function isExpectedToolCall(value) {
  return value !== null && typeof value === "object" && typeof value.tool === "string";
}
89
/**
 * Summarize a trace: total events, per-tool call counts and error count.
 *
 * Only events with `type === "tool_call"` and a truthy `name` are counted as
 * tool calls; events with `type === "error"` are counted as errors.
 *
 * @param {Array<{type: string, name?: string}>} trace - Trace events to summarize.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
 *   `toolNames` is the sorted list of distinct tool names seen.
 */
function computeTraceSummary(trace) {
  // Count in a Map rather than a plain object: indexing `{}` with a tool name
  // such as "constructor" or "__proto__" would read an inherited member and
  // corrupt (or silently drop) the count.
  const counts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      // Keys are stringified, matching how a plain-object key would behave.
      const key = String(event.name);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  return {
    eventCount: trace.length,
    toolNames: [...counts.keys()].sort(),
    // Object.fromEntries defines own data properties, so even "__proto__"
    // ends up as a regular key.
    toolCallsByName: Object.fromEntries(counts),
    errorCount
  };
}
108
+
63
109
  // src/evaluation/yaml-parser.ts
64
110
  import { readFile as readFile5 } from "node:fs/promises";
65
111
  import path6 from "node:path";
@@ -403,10 +449,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
403
449
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
404
450
  continue;
405
451
  }
406
- if (typeValue === "code") {
452
+ if (typeValue === "code_judge") {
407
453
  const script = asString2(rawEvaluator.script);
408
454
  if (!script) {
409
- logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
455
+ logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
410
456
  continue;
411
457
  }
412
458
  const cwd = asString2(rawEvaluator.cwd);
@@ -417,7 +463,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
417
463
  resolvedCwd = path3.resolve(resolved.resolvedPath);
418
464
  } else {
419
465
  logWarning2(
420
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
466
+ `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
421
467
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
422
468
  );
423
469
  }
@@ -433,6 +479,174 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
433
479
  });
434
480
  continue;
435
481
  }
482
+ if (typeValue === "composite") {
483
+ const rawMembers = rawEvaluator.evaluators;
484
+ if (!Array.isArray(rawMembers)) {
485
+ logWarning2(
486
+ `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
487
+ );
488
+ continue;
489
+ }
490
+ const rawAggregator = rawEvaluator.aggregator;
491
+ if (!isJsonObject2(rawAggregator)) {
492
+ logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
493
+ continue;
494
+ }
495
+ const aggregatorType = asString2(rawAggregator.type);
496
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
497
+ logWarning2(
498
+ `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
499
+ );
500
+ continue;
501
+ }
502
+ const memberEvaluators = [];
503
+ for (const rawMember of rawMembers) {
504
+ if (!isJsonObject2(rawMember)) {
505
+ logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
506
+ continue;
507
+ }
508
+ const memberName = asString2(rawMember.name);
509
+ const memberType = rawMember.type;
510
+ if (!memberName || !isEvaluatorKind(memberType)) {
511
+ logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
512
+ continue;
513
+ }
514
+ const memberConfigs = await parseEvaluators(
515
+ { evaluators: [rawMember] },
516
+ void 0,
517
+ searchRoots,
518
+ `${evalId}:${name}:${memberName}`
519
+ );
520
+ if (memberConfigs && memberConfigs.length > 0) {
521
+ memberEvaluators.push(memberConfigs[0]);
522
+ }
523
+ }
524
+ if (memberEvaluators.length === 0) {
525
+ logWarning2(
526
+ `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
527
+ );
528
+ continue;
529
+ }
530
+ let aggregator;
531
+ if (aggregatorType === "weighted_average") {
532
+ const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
533
+ const parsedWeights = {};
534
+ if (weights) {
535
+ for (const [key, value] of Object.entries(weights)) {
536
+ if (typeof value === "number") {
537
+ parsedWeights[key] = value;
538
+ }
539
+ }
540
+ }
541
+ aggregator = {
542
+ type: "weighted_average",
543
+ ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
544
+ };
545
+ } else if (aggregatorType === "code_judge") {
546
+ const aggregatorPath = asString2(rawAggregator.path);
547
+ if (!aggregatorPath) {
548
+ logWarning2(
549
+ `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
550
+ );
551
+ continue;
552
+ }
553
+ aggregator = {
554
+ type: "code_judge",
555
+ path: aggregatorPath,
556
+ cwd: searchRoots[0]
557
+ };
558
+ } else {
559
+ const aggregatorPrompt = asString2(rawAggregator.prompt);
560
+ let promptPath2;
561
+ if (aggregatorPrompt) {
562
+ const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
563
+ if (resolved.resolvedPath) {
564
+ promptPath2 = path3.resolve(resolved.resolvedPath);
565
+ }
566
+ }
567
+ aggregator = {
568
+ type: "llm_judge",
569
+ ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
570
+ ...promptPath2 ? { promptPath: promptPath2 } : {}
571
+ };
572
+ }
573
+ evaluators.push({
574
+ name,
575
+ type: "composite",
576
+ evaluators: memberEvaluators,
577
+ aggregator
578
+ });
579
+ continue;
580
+ }
581
+ if (typeValue === "expected_messages") {
582
+ evaluators.push({
583
+ name,
584
+ type: "expected_messages"
585
+ });
586
+ continue;
587
+ }
588
+ if (typeValue === "tool_trajectory") {
589
+ const mode = asString2(rawEvaluator.mode);
590
+ if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
591
+ logWarning2(
592
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
593
+ );
594
+ continue;
595
+ }
596
+ const rawMinimums = rawEvaluator.minimums;
597
+ let minimums;
598
+ if (rawMinimums !== void 0) {
599
+ if (!isJsonObject2(rawMinimums)) {
600
+ logWarning2(
601
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
602
+ );
603
+ continue;
604
+ }
605
+ minimums = {};
606
+ for (const [toolName, count] of Object.entries(rawMinimums)) {
607
+ if (typeof count === "number" && count >= 0) {
608
+ minimums[toolName] = count;
609
+ }
610
+ }
611
+ }
612
+ const rawExpected = rawEvaluator.expected;
613
+ let expected;
614
+ if (rawExpected !== void 0) {
615
+ if (!Array.isArray(rawExpected)) {
616
+ logWarning2(
617
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
618
+ );
619
+ continue;
620
+ }
621
+ expected = [];
622
+ for (const item of rawExpected) {
623
+ if (isJsonObject2(item) && typeof item.tool === "string") {
624
+ expected.push({ tool: item.tool });
625
+ }
626
+ }
627
+ }
628
+ if (mode === "any_order" && !minimums) {
629
+ logWarning2(
630
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
631
+ );
632
+ continue;
633
+ }
634
+ if ((mode === "in_order" || mode === "exact") && !expected) {
635
+ logWarning2(
636
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
637
+ );
638
+ continue;
639
+ }
640
+ const config = {
641
+ name,
642
+ type: "tool_trajectory",
643
+ mode,
644
+ ...minimums ? { minimums } : {},
645
+ ...expected ? { expected } : {}
646
+ };
647
+ evaluators.push(config);
648
+ continue;
649
+ }
436
650
  const prompt = asString2(rawEvaluator.prompt);
437
651
  let promptPath;
438
652
  if (prompt) {
@@ -686,6 +900,67 @@ ${detailBlock}${ANSI_RESET4}`);
686
900
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
687
901
  }
688
902
  }
903
/**
 * Convert raw expected messages into output segments.
 *
 * Each returned segment carries the message `role`, the `tool_calls` of
 * assistant messages (when defined), and the message content:
 * - string content is copied as-is;
 * - array content is processed entry by entry: non-object entries are
 *   dropped, `type: "file"` entries are resolved via `resolveFileReference2`
 *   and read from disk, and every other object entry is cloned;
 * - any other content shape leaves the segment without a `content` field.
 *
 * Missing or unreadable files are reported through `logWarning3` and skipped;
 * they never abort processing.
 *
 * @param {object} options
 * @param {Array<object>} options.messages - Expected messages to process.
 * @param {string[]} options.searchRoots - Passed through to `resolveFileReference2`.
 * @param {boolean} [options.verbose] - When truthy, log each resolved file.
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const { content } = message;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        if (asString3(rawSegment.type) !== "file") {
          processedContent.push(cloneJsonObject(rawSegment));
          continue;
        }
        const rawValue = asString3(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
          logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
          continue;
        }
        try {
          // Normalize CRLF so file text compares the same across platforms.
          const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          processedContent.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: path4.resolve(resolvedPath)
          });
          if (verbose) {
            console.log(` [Expected Output File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          // A thrown value is not guaranteed to be an Error instance.
          const reason = error instanceof Error ? error.message : String(error);
          logWarning3(`Could not read expected output file ${resolvedPath}: ${reason}`);
        }
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
689
964
 
690
965
  // src/evaluation/formatting/prompt-builder.ts
691
966
  import { readFile as readFile4 } from "node:fs/promises";
@@ -990,12 +1265,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
990
1265
  messageType: "input",
991
1266
  verbose
992
1267
  });
993
- const outputSegments = hasExpectedMessages ? await processMessages({
1268
+ const outputSegments = hasExpectedMessages ? await processExpectedMessages({
994
1269
  messages: expectedMessages,
995
1270
  searchRoots,
996
1271
  repoRootPath,
997
- guidelinePatterns,
998
- messageType: "output",
999
1272
  verbose
1000
1273
  }) : [];
1001
1274
  const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1519,9 +1792,11 @@ var CliProvider = class {
1519
1792
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1520
1793
  throw new Error(message);
1521
1794
  }
1522
- const responseText = await this.readAndCleanupOutputFile(outputFilePath);
1795
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1796
+ const parsed = this.parseOutputContent(responseContent);
1523
1797
  return {
1524
- text: responseText,
1798
+ text: parsed.text,
1799
+ trace: parsed.trace,
1525
1800
  raw: {
1526
1801
  command: renderedCommand,
1527
1802
  stderr: result.stderr,
@@ -1531,6 +1806,31 @@ var CliProvider = class {
1531
1806
  }
1532
1807
  };
1533
1808
  }
1809
+ /**
1810
+ * Parse output content from CLI.
1811
+ * If the content is valid JSON with a 'text' field, extract text and optional trace.
1812
+ * Otherwise, treat the entire content as plain text.
1813
+ */
1814
+ parseOutputContent(content) {
1815
+ try {
1816
+ const parsed = JSON.parse(content);
1817
+ if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1818
+ const obj = parsed;
1819
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1820
+ const trace = this.parseTrace(obj.trace);
1821
+ return { text, trace };
1822
+ }
1823
+ } catch {
1824
+ }
1825
+ return { text: content };
1826
+ }
1827
+ parseTrace(trace) {
1828
+ if (!Array.isArray(trace)) {
1829
+ return void 0;
1830
+ }
1831
+ const validEvents = trace.filter(isTraceEvent);
1832
+ return validEvents.length > 0 ? validEvents : void 0;
1833
+ }
1534
1834
  async readAndCleanupOutputFile(filePath) {
1535
1835
  try {
1536
1836
  const content = await readTextFile(filePath);
@@ -2517,6 +2817,7 @@ var MockProvider = class {
2517
2817
  delayMs;
2518
2818
  delayMinMs;
2519
2819
  delayMaxMs;
2820
+ trace;
2520
2821
  constructor(targetName, config) {
2521
2822
  this.id = `mock:${targetName}`;
2522
2823
  this.targetName = targetName;
@@ -2524,6 +2825,7 @@ var MockProvider = class {
2524
2825
  this.delayMs = config.delayMs ?? 0;
2525
2826
  this.delayMinMs = config.delayMinMs ?? 0;
2526
2827
  this.delayMaxMs = config.delayMaxMs ?? 0;
2828
+ this.trace = config.trace;
2527
2829
  }
2528
2830
  async invoke(request) {
2529
2831
  const delay = this.calculateDelay();
@@ -2535,7 +2837,8 @@ var MockProvider = class {
2535
2837
  raw: {
2536
2838
  question: request.question,
2537
2839
  guidelines: request.guidelines
2538
- }
2840
+ },
2841
+ trace: this.trace
2539
2842
  };
2540
2843
  }
2541
2844
  calculateDelay() {
@@ -3329,6 +3632,473 @@ function substituteVariables(template, variables) {
3329
3632
  return variables[varName] ?? match;
3330
3633
  });
3331
3634
  }
3635
/**
 * Evaluator that scores the tool calls found in a candidate trace.
 *
 * Supported `config.mode` values:
 * - "any_order": each tool in `config.minimums` must be called at least the
 *   given number of times (uses the trace summary);
 * - "in_order": the tools in `config.expected` must appear in the trace in the
 *   given relative order (other calls may be interleaved);
 * - "exact": the trace's tool calls must match `config.expected` position by
 *   position, with no extra calls.
 *
 * Every result has the shape
 * `{score, verdict, hits, misses, expectedAspectCount}`.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{config: object}} options - `options.config` is the parsed evaluator config.
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Score the trace carried by `context`.
   *
   * @param {{candidateTrace?: Array<object>, candidateTraceSummary?: object}} context
   * @returns {object} A failing result when either the trace or its summary is
   *   missing, or when the mode is unknown; otherwise the mode-specific result.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Check per-tool minimum call counts, ignoring call order.
   *
   * @param {{toolCallsByName: Object<string, number>}} summary - Trace summary.
   * @returns {object} Score is the fraction of tools meeting their minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const counts = summary.toolCallsByName;
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property lookup only: a tool named e.g. "toString" that was never
      // called must count as 0, not resolve to an inherited function.
      const actual = (Object.hasOwn(counts, toolName) ? counts[toolName] : void 0) ?? 0;
      const line = `${toolName}: called ${actual} times (required \u2265${required})`;
      if (actual >= required) {
        hits.push(line);
      } else {
        misses.push(line);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the expected tools appear in the trace in the expected relative
   * order. Unrelated calls between them are allowed.
   *
   * @param {Array<object>} trace - Candidate trace events.
   * @returns {object} Score is the fraction of expected tools that were found.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    // Index of the first trace call that has not been matched yet.
    let cursor = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let foundAt = -1;
      for (let j = cursor; j < actualToolCalls.length; j++) {
        if (actualToolCalls[j].name === expectedTool) {
          foundAt = j;
          break;
        }
      }
      if (foundAt === -1) {
        // Leave the cursor untouched: one missing tool must not consume the
        // rest of the trace and hide later expected tools that are present.
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
        continue;
      }
      hits.push(`Found ${expectedTool} at position ${foundAt}`);
      cursor = foundAt + 1;
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the trace's tool calls match the expected sequence exactly,
   * position by position.
   *
   * @param {Array<object>} trace - Candidate trace events.
   * @returns {object} Score is matched positions divided by the longer of the
   *   expected and actual sequences, so missing and extra calls both lower it.
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Divide by the longer sequence so that extra trace calls cannot yield a
    // perfect score in a mode that demands an exact match.
    const score = hits.length / Math.max(expected.length, actualToolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
3781
/**
 * Evaluator that checks the tool calls declared on assistant messages in the
 * eval case's expected segments against the tool calls in the candidate trace.
 *
 * Expected and actual tool calls are compared position by position. When an
 * expected call declares `input`, the actual input must be deeply equal.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * @param {{candidateTrace?: Array<object>, evalCase: {expected_segments?: Array<object>}}} context
   * @returns {object} `{score, verdict, hits, misses, expectedAspectCount}`.
   *   Passes with score 1 when no tool calls are expected; fails with score 0
   *   when tool calls are expected but the trace is missing or empty.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedToolCalls = this.extractExpectedToolCalls(evalCase.expected_segments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collect `{tool, input}` pairs from the `tool_calls` arrays of assistant
   * segments. Entries that are not objects with a string `tool` are ignored.
   *
   * @param {Array<object>|undefined} segments - Expected segments.
   * @returns {Array<{tool: string, input: unknown}>}
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      if (segment.role !== "assistant" || !Array.isArray(segment.tool_calls)) {
        continue;
      }
      for (const tc of segment.tool_calls) {
        if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
          toolCalls.push({ tool: tc.tool, input: tc.input });
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compare expected and actual tool calls position by position.
   *
   * @param {Array<{tool: string, input: unknown}>} expected
   * @param {Array<{name?: string, input?: unknown}>} actual
   * @returns {object} Score is the fraction of expected calls that matched.
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Input is only compared when the expectation declares one.
      if (expectedCall.input !== void 0 && !this.deepEquals(expectedCall.input, actualCall.input)) {
        misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
        continue;
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for JSON-like values (primitives, arrays, plain objects).
   *
   * @param {unknown} a
   * @param {unknown} b
   * @returns {boolean}
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    const aIsArray = Array.isArray(a);
    if (aIsArray !== Array.isArray(b)) return false;
    if (aIsArray) {
      return a.length === b.length && a.every((val, i) => this.deepEquals(val, b[i]));
    }
    const aKeys = Object.keys(a);
    if (aKeys.length !== Object.keys(b).length) return false;
    // Require the key to exist on `b`: without this check `{x: undefined}`
    // would equal `{y: 1}` (same key count, and `b.x` reads as undefined).
    return aKeys.every((key) => Object.hasOwn(b, key) && this.deepEquals(a[key], b[key]));
  }
};
3880
// Prompt template used when an llm_judge aggregator is configured without its
// own prompt. Every `{{EVALUATOR_RESULTS_JSON}}` placeholder is replaced with
// the JSON-encoded member evaluator results before the prompt is sent.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = [
  "Review the following evaluation results:",
  "{{EVALUATOR_RESULTS_JSON}}",
  "",
  "Decide the final score and verdict based on all evaluator results.",
  "Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning."
].join("\n");
3885
/**
 * Evaluator that runs several member evaluators and combines their results
 * with one of three aggregators, selected by `config.aggregator.type`:
 * - "code_judge": an external script receives the member results on stdin-like
 *   input (via `executeScript`) and returns the final result as JSON;
 * - "llm_judge": the judge provider is prompted with the member results;
 * - anything else: weighted average of the member scores.
 *
 * Every aggregate result includes `evaluatorResults`, a per-member breakdown.
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;
  /**
   * @param {{config: object, evaluatorFactory: {create: Function}, cwd?: string}} options
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }
  /**
   * Run all member evaluators concurrently, then aggregate their results.
   *
   * @param {object} context - Evaluation context, passed unchanged to each member.
   * @returns {Promise<object>} The aggregated result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }
  /**
   * Dispatch to the configured aggregator.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context
   * @returns {Promise<object>|object}
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }
  /**
   * Weight configured for a member, defaulting to 1.
   * Uses an own-property lookup so a member id such as "constructor" cannot
   * resolve to an inherited Object.prototype member.
   */
  #weightFor(weights, id) {
    const configured = weights && Object.hasOwn(weights, id) ? weights[id] : void 0;
    return configured ?? 1;
  }
  /**
   * Build the per-member breakdown entry. `weight` is only included when given.
   */
  #describeMember(member, weight) {
    const { result } = member;
    return {
      name: member.id,
      type: member.type,
      score: result.score,
      ...(weight === void 0 ? {} : { weight }),
      verdict: result.verdict,
      hits: [...result.hits],
      misses: [...result.misses],
      reasoning: result.reasoning,
      evaluatorRawRequest: result.evaluatorRawRequest,
      evaluatorResults: result.evaluatorResults
    };
  }
  /**
   * Combine member scores as a weighted average. Hits and misses of all
   * members are merged, each prefixed with `[memberId]`.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {Object<string, number>} [weights] - Per-member weights; missing entries default to 1.
   * @returns {object}
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = this.#weightFor(weights, member.id);
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push(this.#describeMember(member, weight));
    }
    // Clamp once and derive the verdict from the same value that is reported.
    const finalScore = clampScore(totalWeight > 0 ? weightedSum / totalWeight : 0);
    return {
      score: finalScore,
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }
  /**
   * Delegate aggregation to an external script. The script receives
   * `{results: {memberId: result}}` as JSON and its stdout is parsed as the
   * final result. A script failure yields a failing result that carries the
   * error message.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {string} scriptPath - Script handed to `executeScript`.
   * @param {string} [cwd] - Working directory handed to `executeScript`.
   * @param {Object<string, number>} [weights] - Only reported in the per-member breakdown.
   * @returns {Promise<object>}
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map(
      (member) => this.#describeMember(member, this.#weightFor(weights, member.id))
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Trust the script's verdict only when it is one of the known values.
      const declared = parsed?.verdict;
      const verdict = declared === "pass" || declared === "fail" || declared === "borderline" ? declared : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }
  /**
   * Ask the judge provider to decide the final result from the member results.
   * Uses the provider's language model when `asLanguageModel()` returns one,
   * otherwise falls back to `judgeProvider.invoke`. A failure while querying
   * or parsing yields a failing result that carries the error message.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context - Must carry `judgeProvider`; `evalCase.id` and
   *   `attempt` are forwarded on the `invoke` path.
   * @param {{prompt?: string}} config - Aggregator config; `prompt` overrides the default template.
   * @returns {Promise<object>}
   * @throws {Error} When `context.judgeProvider` is missing.
   */
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => this.#describeMember(member));
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // Use a replacer function: a replacement *string* would interpret "$&",
    // "$1", "$$" etc. found inside the evaluator output and corrupt the prompt.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      let rawText;
      let response;
      if (model) {
        ({ text: rawText } = await generateText2({
          model,
          system: systemPrompt,
          prompt: userPrompt
        }));
      } else {
        response = await judgeProvider.invoke({
          question: userPrompt,
          systemPrompt,
          evalCaseId: context.evalCase.id,
          attempt: context.attempt
        });
        rawText = response.text ?? "";
      }
      const data = freeformEvaluationSchema.parse(parseJsonFromText(rawText));
      const score = clampScore(data.score);
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      // Only the invoke path has a provider-level reasoning to fall back on.
      const reasoning = response ? data.reasoning ?? response.reasoning : data.reasoning;
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch (error) {
      // Surface the failure reason instead of returning an unexplained zero.
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: { ...evaluatorRawRequest, error: message },
        evaluatorResults
      };
    }
  }
};
3332
4102
 
3333
4103
  // src/evaluation/orchestrator.ts
3334
4104
  import { createHash, randomUUID as randomUUID2 } from "node:crypto";
@@ -3530,7 +4300,7 @@ async function runEvaluation(options) {
3530
4300
  if (!definition) {
3531
4301
  return void 0;
3532
4302
  }
3533
- const resolved = resolveTargetDefinition(definition, envLookup);
4303
+ const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
3534
4304
  resolvedTargetsByName.set(name, resolved);
3535
4305
  return resolved;
3536
4306
  };
@@ -3844,6 +4614,17 @@ async function runEvalCase(options) {
3844
4614
  if (cacheKey && cache && !cachedResponse) {
3845
4615
  await cache.set(cacheKey, providerResponse);
3846
4616
  }
4617
+ let candidateTrace = providerResponse.trace;
4618
+ if (!candidateTrace && providerResponse.traceRef) {
4619
+ try {
4620
+ const rawTrace = await readJsonFile(providerResponse.traceRef);
4621
+ if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
4622
+ candidateTrace = rawTrace;
4623
+ }
4624
+ } catch {
4625
+ }
4626
+ }
4627
+ const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
3847
4628
  try {
3848
4629
  return await evaluateCandidate({
3849
4630
  evalCase,
@@ -3855,7 +4636,9 @@ async function runEvalCase(options) {
3855
4636
  nowFn,
3856
4637
  attempt,
3857
4638
  judgeProvider,
3858
- agentTimeoutMs
4639
+ agentTimeoutMs,
4640
+ candidateTrace,
4641
+ candidateTraceSummary
3859
4642
  });
3860
4643
  } catch (error) {
3861
4644
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -3872,7 +4655,9 @@ async function evaluateCandidate(options) {
3872
4655
  nowFn,
3873
4656
  attempt,
3874
4657
  judgeProvider,
3875
- agentTimeoutMs
4658
+ agentTimeoutMs,
4659
+ candidateTrace,
4660
+ candidateTraceSummary
3876
4661
  } = options;
3877
4662
  const gradeTimestamp = nowFn();
3878
4663
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -3885,7 +4670,9 @@ async function evaluateCandidate(options) {
3885
4670
  promptInputs,
3886
4671
  now: gradeTimestamp,
3887
4672
  judgeProvider,
3888
- agentTimeoutMs
4673
+ agentTimeoutMs,
4674
+ candidateTrace,
4675
+ candidateTraceSummary
3889
4676
  });
3890
4677
  const completedAt = nowFn();
3891
4678
  let agentProviderRequest;
@@ -3924,7 +4711,8 @@ async function evaluateCandidate(options) {
3924
4711
  agent_provider_request: agentProviderRequest,
3925
4712
  lm_provider_request: lmProviderRequest,
3926
4713
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3927
- evaluator_results: evaluatorResults
4714
+ evaluator_results: evaluatorResults,
4715
+ trace_summary: candidateTraceSummary
3928
4716
  };
3929
4717
  }
3930
4718
  async function runEvaluatorsForCase(options) {
@@ -3938,7 +4726,9 @@ async function runEvaluatorsForCase(options) {
3938
4726
  promptInputs,
3939
4727
  now,
3940
4728
  judgeProvider,
3941
- agentTimeoutMs
4729
+ agentTimeoutMs,
4730
+ candidateTrace,
4731
+ candidateTraceSummary
3942
4732
  } = options;
3943
4733
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
3944
4734
  return runEvaluatorList({
@@ -3952,7 +4742,9 @@ async function runEvaluatorsForCase(options) {
3952
4742
  promptInputs,
3953
4743
  now,
3954
4744
  judgeProvider,
3955
- agentTimeoutMs
4745
+ agentTimeoutMs,
4746
+ candidateTrace,
4747
+ candidateTraceSummary
3956
4748
  });
3957
4749
  }
3958
4750
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -3968,7 +4760,9 @@ async function runEvaluatorsForCase(options) {
3968
4760
  attempt,
3969
4761
  promptInputs,
3970
4762
  now,
3971
- judgeProvider
4763
+ judgeProvider,
4764
+ candidateTrace,
4765
+ candidateTraceSummary
3972
4766
  });
3973
4767
  return { score };
3974
4768
  }
@@ -3984,7 +4778,9 @@ async function runEvaluatorList(options) {
3984
4778
  promptInputs,
3985
4779
  now,
3986
4780
  judgeProvider,
3987
- agentTimeoutMs
4781
+ agentTimeoutMs,
4782
+ candidateTrace,
4783
+ candidateTraceSummary
3988
4784
  } = options;
3989
4785
  const scored = [];
3990
4786
  const evaluatorResults = [];
@@ -4030,6 +4826,63 @@ async function runEvaluatorList(options) {
4030
4826
  promptInputs,
4031
4827
  now
4032
4828
  });
4829
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
4830
+ evaluatorResults.push({
4831
+ name: evaluator.name,
4832
+ type: "code_judge",
4833
+ score: score2.score,
4834
+ verdict: score2.verdict,
4835
+ hits: score2.hits,
4836
+ misses: score2.misses,
4837
+ reasoning: score2.reasoning,
4838
+ evaluator_provider_request: score2.evaluatorRawRequest
4839
+ });
4840
+ }
4841
+ if (evaluator.type === "composite") {
4842
+ const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
4843
+ const createEvaluator = (memberConfig) => {
4844
+ switch (memberConfig.type) {
4845
+ case "llm_judge":
4846
+ return evaluatorRegistry.llm_judge;
4847
+ case "code":
4848
+ return new CodeEvaluator({
4849
+ script: memberConfig.script,
4850
+ cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
4851
+ agentTimeoutMs
4852
+ });
4853
+ case "composite":
4854
+ return new CompositeEvaluator({
4855
+ config: memberConfig,
4856
+ cwd: evalFileDir,
4857
+ evaluatorFactory: { create: createEvaluator }
4858
+ });
4859
+ case "tool_trajectory":
4860
+ return new ToolTrajectoryEvaluator({
4861
+ config: memberConfig
4862
+ });
4863
+ case "expected_messages":
4864
+ return new ExpectedMessagesEvaluator();
4865
+ default: {
4866
+ const unknownConfig = memberConfig;
4867
+ throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
4868
+ }
4869
+ }
4870
+ };
4871
+ const compositeEvaluator = new CompositeEvaluator({
4872
+ config: evaluator,
4873
+ cwd: evalFileDir,
4874
+ evaluatorFactory: { create: createEvaluator }
4875
+ });
4876
+ const score2 = await compositeEvaluator.evaluate({
4877
+ evalCase,
4878
+ candidate,
4879
+ target,
4880
+ provider,
4881
+ attempt,
4882
+ promptInputs,
4883
+ now,
4884
+ judgeProvider
4885
+ });
4033
4886
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4034
4887
  evaluatorResults.push({
4035
4888
  name: evaluator.name,
@@ -4039,7 +4892,58 @@ async function runEvaluatorList(options) {
4039
4892
  hits: score2.hits,
4040
4893
  misses: score2.misses,
4041
4894
  reasoning: score2.reasoning,
4042
- evaluator_provider_request: score2.evaluatorRawRequest
4895
+ evaluator_provider_request: score2.evaluatorRawRequest,
4896
+ evaluator_results: mapChildResults(score2.evaluatorResults)
4897
+ });
4898
+ }
4899
+ if (evaluator.type === "tool_trajectory") {
4900
+ const trajectoryEvaluator = new ToolTrajectoryEvaluator({
4901
+ config: evaluator
4902
+ });
4903
+ const score2 = trajectoryEvaluator.evaluate({
4904
+ evalCase,
4905
+ candidate,
4906
+ target,
4907
+ provider,
4908
+ attempt,
4909
+ promptInputs,
4910
+ now,
4911
+ candidateTrace,
4912
+ candidateTraceSummary
4913
+ });
4914
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4915
+ evaluatorResults.push({
4916
+ name: evaluator.name,
4917
+ type: evaluator.type,
4918
+ score: score2.score,
4919
+ verdict: score2.verdict,
4920
+ hits: score2.hits,
4921
+ misses: score2.misses,
4922
+ reasoning: score2.reasoning
4923
+ });
4924
+ }
4925
+ if (evaluator.type === "expected_messages") {
4926
+ const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
4927
+ const score2 = expectedMessagesEvaluator.evaluate({
4928
+ evalCase,
4929
+ candidate,
4930
+ target,
4931
+ provider,
4932
+ attempt,
4933
+ promptInputs,
4934
+ now,
4935
+ candidateTrace,
4936
+ candidateTraceSummary
4937
+ });
4938
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4939
+ evaluatorResults.push({
4940
+ name: evaluator.name,
4941
+ type: evaluator.type,
4942
+ score: score2.score,
4943
+ verdict: score2.verdict,
4944
+ hits: score2.hits,
4945
+ misses: score2.misses,
4946
+ reasoning: score2.reasoning
4043
4947
  });
4044
4948
  }
4045
4949
  } catch (error) {
@@ -4052,14 +4956,15 @@ async function runEvaluatorList(options) {
4052
4956
  expectedAspectCount: 1,
4053
4957
  reasoning: message
4054
4958
  };
4959
+ const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
4055
4960
  scored.push({
4056
4961
  score: fallbackScore,
4057
4962
  name: evaluator.name ?? "unknown",
4058
- type: evaluator.type ?? "unknown"
4963
+ type: resultType ?? "llm_judge"
4059
4964
  });
4060
4965
  evaluatorResults.push({
4061
4966
  name: evaluator.name ?? "unknown",
4062
- type: evaluator.type ?? "unknown",
4967
+ type: resultType ?? "llm_judge",
4063
4968
  score: 0,
4064
4969
  verdict: "fail",
4065
4970
  hits: [],
@@ -4277,6 +5182,23 @@ function isTimeoutLike(error) {
4277
5182
  const value = String(error).toLowerCase();
4278
5183
  return value.includes("timeout");
4279
5184
  }
/**
 * Recursively converts child evaluator results into the snake_case record
 * shape used for reported evaluator results.
 *
 * @param {Array<object>|null|undefined} children - Child results. Each entry's
 *   `name`, `type`, `score`, `weight`, `verdict`, `hits`, `misses`,
 *   `reasoning`, `evaluatorRawRequest` and `evaluatorResults` fields are read.
 * @returns {Array<object>|undefined} One mapped record per child, or
 *   `undefined` when `children` is missing, is not an array, or is empty.
 */
function mapChildResults(children) {
  // A value that is not a non-empty array has no children to report; the
  // Array.isArray check also keeps array-likes without `.map` from throwing.
  if (!Array.isArray(children) || children.length === 0) {
    return void 0;
  }
  return children.map((child) => ({
    name: child.name,
    type: child.type,
    score: child.score,
    weight: child.weight,
    verdict: child.verdict,
    hits: child.hits,
    misses: child.misses,
    reasoning: child.reasoning,
    evaluator_provider_request: child.evaluatorRawRequest,
    // Nested children are mapped the same way, to any depth.
    evaluator_results: mapChildResults(child.evaluatorResults)
  }));
}
4280
5202
 
4281
5203
  // src/evaluation/generators/rubric-generator.ts
4282
5204
  import { generateText as generateText3 } from "ai";
@@ -4364,11 +5286,15 @@ function createAgentKernel() {
4364
5286
  }
4365
5287
  export {
4366
5288
  CodeEvaluator,
5289
+ CompositeEvaluator,
5290
+ ExpectedMessagesEvaluator,
4367
5291
  LlmJudgeEvaluator,
4368
5292
  TEST_MESSAGE_ROLES,
5293
+ ToolTrajectoryEvaluator,
4369
5294
  buildDirectoryChain,
4370
5295
  buildPromptInputs,
4371
5296
  buildSearchRoots,
5297
+ computeTraceSummary,
4372
5298
  consumeCodexLogEntries,
4373
5299
  createAgentKernel,
4374
5300
  createProvider,
@@ -4379,14 +5305,18 @@ export {
4379
5305
  generateRubrics,
4380
5306
  getHitCount,
4381
5307
  isEvaluatorKind,
5308
+ isExpectedToolCall,
4382
5309
  isGuidelineFile,
4383
5310
  isJsonObject,
4384
5311
  isJsonValue,
4385
5312
  isTestMessage,
4386
5313
  isTestMessageRole,
5314
+ isTraceEvent,
5315
+ isTraceEventType,
4387
5316
  listTargetNames,
4388
5317
  loadEvalCases,
4389
5318
  normalizeLineEndings,
5319
+ readJsonFile,
4390
5320
  readTargetDefinitions,
4391
5321
  readTestSuiteMetadata,
4392
5322
  readTextFile,