@agentv/core 0.22.2 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,11 +31,15 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
+ CompositeEvaluator: () => CompositeEvaluator,
35
+ ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
34
36
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
35
37
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
38
+ ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
36
39
  buildDirectoryChain: () => buildDirectoryChain2,
37
40
  buildPromptInputs: () => buildPromptInputs,
38
41
  buildSearchRoots: () => buildSearchRoots2,
42
+ computeTraceSummary: () => computeTraceSummary,
39
43
  consumeCodexLogEntries: () => consumeCodexLogEntries,
40
44
  createAgentKernel: () => createAgentKernel,
41
45
  createProvider: () => createProvider,
@@ -46,14 +50,18 @@ __export(index_exports, {
46
50
  generateRubrics: () => generateRubrics,
47
51
  getHitCount: () => getHitCount,
48
52
  isEvaluatorKind: () => isEvaluatorKind,
53
+ isExpectedToolCall: () => isExpectedToolCall,
49
54
  isGuidelineFile: () => isGuidelineFile,
50
55
  isJsonObject: () => isJsonObject,
51
56
  isJsonValue: () => isJsonValue,
52
57
  isTestMessage: () => isTestMessage,
53
58
  isTestMessageRole: () => isTestMessageRole,
59
+ isTraceEvent: () => isTraceEvent,
60
+ isTraceEventType: () => isTraceEventType,
54
61
  listTargetNames: () => listTargetNames,
55
62
  loadEvalCases: () => loadEvalCases,
56
63
  normalizeLineEndings: () => normalizeLineEndings,
64
+ readJsonFile: () => readJsonFile,
57
65
  readTargetDefinitions: () => readTargetDefinitions,
58
66
  readTestSuiteMetadata: () => readTestSuiteMetadata,
59
67
  readTextFile: () => readTextFile,
@@ -107,7 +115,14 @@ function isTestMessage(value) {
107
115
  }
108
116
  return candidate.content.every(isJsonObject);
109
117
  }
110
- var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
118
+ var EVALUATOR_KIND_VALUES = [
119
+ "code_judge",
120
+ "llm_judge",
121
+ "rubric",
122
+ "composite",
123
+ "tool_trajectory",
124
+ "expected_messages"
125
+ ];
111
126
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
112
127
  function isEvaluatorKind(value) {
113
128
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -116,6 +131,44 @@ function getHitCount(result) {
116
131
  return result.hits.length;
117
132
  }
118
133
 
134
+ // src/evaluation/trace.ts
135
+ function isTraceEventType(value) {
136
+ return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
137
+ }
138
+ function isTraceEvent(value) {
139
+ if (typeof value !== "object" || value === null) {
140
+ return false;
141
+ }
142
+ const candidate = value;
143
+ return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
144
+ }
145
+ function isExpectedToolCall(value) {
146
+ if (typeof value !== "object" || value === null) {
147
+ return false;
148
+ }
149
+ const candidate = value;
150
+ return typeof candidate.tool === "string";
151
+ }
152
+ function computeTraceSummary(trace) {
153
+ const toolCallCounts = {};
154
+ let errorCount = 0;
155
+ for (const event of trace) {
156
+ if (event.type === "tool_call" && event.name) {
157
+ toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
158
+ }
159
+ if (event.type === "error") {
160
+ errorCount++;
161
+ }
162
+ }
163
+ const toolNames = Object.keys(toolCallCounts).sort();
164
+ return {
165
+ eventCount: trace.length,
166
+ toolNames,
167
+ toolCallsByName: toolCallCounts,
168
+ errorCount
169
+ };
170
+ }
171
+
119
172
  // src/evaluation/yaml-parser.ts
120
173
  var import_promises6 = require("fs/promises");
121
174
  var import_node_path6 = __toESM(require("path"), 1);
@@ -459,10 +512,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
459
512
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
460
513
  continue;
461
514
  }
462
- if (typeValue === "code") {
515
+ if (typeValue === "code_judge") {
463
516
  const script = asString2(rawEvaluator.script);
464
517
  if (!script) {
465
- logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
518
+ logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
466
519
  continue;
467
520
  }
468
521
  const cwd = asString2(rawEvaluator.cwd);
@@ -473,7 +526,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
473
526
  resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
474
527
  } else {
475
528
  logWarning2(
476
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
529
+ `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
477
530
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
478
531
  );
479
532
  }
@@ -489,6 +542,174 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
489
542
  });
490
543
  continue;
491
544
  }
545
+ if (typeValue === "composite") {
546
+ const rawMembers = rawEvaluator.evaluators;
547
+ if (!Array.isArray(rawMembers)) {
548
+ logWarning2(
549
+ `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
550
+ );
551
+ continue;
552
+ }
553
+ const rawAggregator = rawEvaluator.aggregator;
554
+ if (!isJsonObject2(rawAggregator)) {
555
+ logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
556
+ continue;
557
+ }
558
+ const aggregatorType = asString2(rawAggregator.type);
559
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
560
+ logWarning2(
561
+ `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
562
+ );
563
+ continue;
564
+ }
565
+ const memberEvaluators = [];
566
+ for (const rawMember of rawMembers) {
567
+ if (!isJsonObject2(rawMember)) {
568
+ logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
569
+ continue;
570
+ }
571
+ const memberName = asString2(rawMember.name);
572
+ const memberType = rawMember.type;
573
+ if (!memberName || !isEvaluatorKind(memberType)) {
574
+ logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
575
+ continue;
576
+ }
577
+ const memberConfigs = await parseEvaluators(
578
+ { evaluators: [rawMember] },
579
+ void 0,
580
+ searchRoots,
581
+ `${evalId}:${name}:${memberName}`
582
+ );
583
+ if (memberConfigs && memberConfigs.length > 0) {
584
+ memberEvaluators.push(memberConfigs[0]);
585
+ }
586
+ }
587
+ if (memberEvaluators.length === 0) {
588
+ logWarning2(
589
+ `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
590
+ );
591
+ continue;
592
+ }
593
+ let aggregator;
594
+ if (aggregatorType === "weighted_average") {
595
+ const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
596
+ const parsedWeights = {};
597
+ if (weights) {
598
+ for (const [key, value] of Object.entries(weights)) {
599
+ if (typeof value === "number") {
600
+ parsedWeights[key] = value;
601
+ }
602
+ }
603
+ }
604
+ aggregator = {
605
+ type: "weighted_average",
606
+ ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
607
+ };
608
+ } else if (aggregatorType === "code_judge") {
609
+ const aggregatorPath = asString2(rawAggregator.path);
610
+ if (!aggregatorPath) {
611
+ logWarning2(
612
+ `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
613
+ );
614
+ continue;
615
+ }
616
+ aggregator = {
617
+ type: "code_judge",
618
+ path: aggregatorPath,
619
+ cwd: searchRoots[0]
620
+ };
621
+ } else {
622
+ const aggregatorPrompt = asString2(rawAggregator.prompt);
623
+ let promptPath2;
624
+ if (aggregatorPrompt) {
625
+ const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
626
+ if (resolved.resolvedPath) {
627
+ promptPath2 = import_node_path3.default.resolve(resolved.resolvedPath);
628
+ }
629
+ }
630
+ aggregator = {
631
+ type: "llm_judge",
632
+ ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
633
+ ...promptPath2 ? { promptPath: promptPath2 } : {}
634
+ };
635
+ }
636
+ evaluators.push({
637
+ name,
638
+ type: "composite",
639
+ evaluators: memberEvaluators,
640
+ aggregator
641
+ });
642
+ continue;
643
+ }
644
+ if (typeValue === "expected_messages") {
645
+ evaluators.push({
646
+ name,
647
+ type: "expected_messages"
648
+ });
649
+ continue;
650
+ }
651
+ if (typeValue === "tool_trajectory") {
652
+ const mode = asString2(rawEvaluator.mode);
653
+ if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
654
+ logWarning2(
655
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
656
+ );
657
+ continue;
658
+ }
659
+ const rawMinimums = rawEvaluator.minimums;
660
+ let minimums;
661
+ if (rawMinimums !== void 0) {
662
+ if (!isJsonObject2(rawMinimums)) {
663
+ logWarning2(
664
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
665
+ );
666
+ continue;
667
+ }
668
+ minimums = {};
669
+ for (const [toolName, count] of Object.entries(rawMinimums)) {
670
+ if (typeof count === "number" && count >= 0) {
671
+ minimums[toolName] = count;
672
+ }
673
+ }
674
+ }
675
+ const rawExpected = rawEvaluator.expected;
676
+ let expected;
677
+ if (rawExpected !== void 0) {
678
+ if (!Array.isArray(rawExpected)) {
679
+ logWarning2(
680
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
681
+ );
682
+ continue;
683
+ }
684
+ expected = [];
685
+ for (const item of rawExpected) {
686
+ if (isJsonObject2(item) && typeof item.tool === "string") {
687
+ expected.push({ tool: item.tool });
688
+ }
689
+ }
690
+ }
691
+ if (mode === "any_order" && !minimums) {
692
+ logWarning2(
693
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
694
+ );
695
+ continue;
696
+ }
697
+ if ((mode === "in_order" || mode === "exact") && !expected) {
698
+ logWarning2(
699
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
700
+ );
701
+ continue;
702
+ }
703
+ const config = {
704
+ name,
705
+ type: "tool_trajectory",
706
+ mode,
707
+ ...minimums ? { minimums } : {},
708
+ ...expected ? { expected } : {}
709
+ };
710
+ evaluators.push(config);
711
+ continue;
712
+ }
492
713
  const prompt = asString2(rawEvaluator.prompt);
493
714
  let promptPath;
494
715
  if (prompt) {
@@ -742,6 +963,67 @@ ${detailBlock}${ANSI_RESET4}`);
742
963
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
743
964
  }
744
965
  }
966
+ async function processExpectedMessages(options) {
967
+ const { messages, searchRoots, repoRootPath, verbose } = options;
968
+ const segments = [];
969
+ for (const message of messages) {
970
+ const segment = {
971
+ role: message.role
972
+ };
973
+ if (message.role === "assistant" && message.tool_calls !== void 0) {
974
+ segment.tool_calls = message.tool_calls;
975
+ }
976
+ const content = message.content;
977
+ if (typeof content === "string") {
978
+ segment.content = content;
979
+ } else if (Array.isArray(content)) {
980
+ const processedContent = [];
981
+ for (const rawSegment of content) {
982
+ if (!isJsonObject(rawSegment)) {
983
+ continue;
984
+ }
985
+ const segmentType = asString3(rawSegment.type);
986
+ if (segmentType === "file") {
987
+ const rawValue = asString3(rawSegment.value);
988
+ if (!rawValue) {
989
+ continue;
990
+ }
991
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
992
+ rawValue,
993
+ searchRoots
994
+ );
995
+ if (!resolvedPath) {
996
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
997
+ logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
998
+ continue;
999
+ }
1000
+ try {
1001
+ const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
1002
+ processedContent.push({
1003
+ type: "file",
1004
+ path: displayPath,
1005
+ text: fileContent,
1006
+ resolvedPath: import_node_path4.default.resolve(resolvedPath)
1007
+ });
1008
+ if (verbose) {
1009
+ console.log(` [Expected Output File] Found: ${displayPath}`);
1010
+ console.log(` Resolved to: ${resolvedPath}`);
1011
+ }
1012
+ } catch (error) {
1013
+ logWarning3(
1014
+ `Could not read expected output file ${resolvedPath}: ${error.message}`
1015
+ );
1016
+ }
1017
+ continue;
1018
+ }
1019
+ processedContent.push(cloneJsonObject(rawSegment));
1020
+ }
1021
+ segment.content = processedContent;
1022
+ }
1023
+ segments.push(segment);
1024
+ }
1025
+ return segments;
1026
+ }
745
1027
 
746
1028
  // src/evaluation/formatting/prompt-builder.ts
747
1029
  var import_promises5 = require("fs/promises");
@@ -1046,12 +1328,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1046
1328
  messageType: "input",
1047
1329
  verbose
1048
1330
  });
1049
- const outputSegments = hasExpectedMessages ? await processMessages({
1331
+ const outputSegments = hasExpectedMessages ? await processExpectedMessages({
1050
1332
  messages: expectedMessages,
1051
1333
  searchRoots,
1052
1334
  repoRootPath,
1053
- guidelinePatterns,
1054
- messageType: "output",
1055
1335
  verbose
1056
1336
  }) : [];
1057
1337
  const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1178,6 +1458,10 @@ async function readTextFile(filePath) {
1178
1458
  const content = await (0, import_promises7.readFile)(filePath, "utf8");
1179
1459
  return normalizeLineEndings(content);
1180
1460
  }
1461
+ async function readJsonFile(filePath) {
1462
+ const content = await (0, import_promises7.readFile)(filePath, "utf8");
1463
+ return JSON.parse(content);
1464
+ }
1181
1465
  async function findGitRoot(startPath) {
1182
1466
  let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
1183
1467
  const root = import_node_path7.default.parse(currentDir).root;
@@ -1686,9 +1970,11 @@ var CliProvider = class {
1686
1970
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1687
1971
  throw new Error(message);
1688
1972
  }
1689
- const responseText = await this.readAndCleanupOutputFile(outputFilePath);
1973
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1974
+ const parsed = this.parseOutputContent(responseContent);
1690
1975
  return {
1691
- text: responseText,
1976
+ text: parsed.text,
1977
+ trace: parsed.trace,
1692
1978
  raw: {
1693
1979
  command: renderedCommand,
1694
1980
  stderr: result.stderr,
@@ -1698,6 +1984,31 @@ var CliProvider = class {
1698
1984
  }
1699
1985
  };
1700
1986
  }
1987
+ /**
1988
+ * Parse output content from CLI.
1989
+ * If the content is valid JSON with a 'text' field, extract text and optional trace.
1990
+ * Otherwise, treat the entire content as plain text.
1991
+ */
1992
+ parseOutputContent(content) {
1993
+ try {
1994
+ const parsed = JSON.parse(content);
1995
+ if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1996
+ const obj = parsed;
1997
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1998
+ const trace = this.parseTrace(obj.trace);
1999
+ return { text, trace };
2000
+ }
2001
+ } catch {
2002
+ }
2003
+ return { text: content };
2004
+ }
2005
+ parseTrace(trace) {
2006
+ if (!Array.isArray(trace)) {
2007
+ return void 0;
2008
+ }
2009
+ const validEvents = trace.filter(isTraceEvent);
2010
+ return validEvents.length > 0 ? validEvents : void 0;
2011
+ }
1701
2012
  async readAndCleanupOutputFile(filePath) {
1702
2013
  try {
1703
2014
  const content = await readTextFile(filePath);
@@ -2684,6 +2995,7 @@ var MockProvider = class {
2684
2995
  delayMs;
2685
2996
  delayMinMs;
2686
2997
  delayMaxMs;
2998
+ trace;
2687
2999
  constructor(targetName, config) {
2688
3000
  this.id = `mock:${targetName}`;
2689
3001
  this.targetName = targetName;
@@ -2691,6 +3003,7 @@ var MockProvider = class {
2691
3003
  this.delayMs = config.delayMs ?? 0;
2692
3004
  this.delayMinMs = config.delayMinMs ?? 0;
2693
3005
  this.delayMaxMs = config.delayMaxMs ?? 0;
3006
+ this.trace = config.trace;
2694
3007
  }
2695
3008
  async invoke(request) {
2696
3009
  const delay = this.calculateDelay();
@@ -2702,7 +3015,8 @@ var MockProvider = class {
2702
3015
  raw: {
2703
3016
  question: request.question,
2704
3017
  guidelines: request.guidelines
2705
- }
3018
+ },
3019
+ trace: this.trace
2706
3020
  };
2707
3021
  }
2708
3022
  calculateDelay() {
@@ -2716,6 +3030,7 @@ var MockProvider = class {
2716
3030
  };
2717
3031
 
2718
3032
  // src/evaluation/providers/targets.ts
3033
+ var import_node_path11 = __toESM(require("path"), 1);
2719
3034
  var import_zod = require("zod");
2720
3035
  var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
2721
3036
  "PROMPT",
@@ -2731,7 +3046,7 @@ var BASE_TARGET_SCHEMA = import_zod.z.object({
2731
3046
  judge_target: import_zod.z.string().optional(),
2732
3047
  workers: import_zod.z.number().int().min(1).optional()
2733
3048
  }).passthrough();
2734
- var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
3049
+ var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
2735
3050
  function normalizeAzureApiVersion(value) {
2736
3051
  if (!value) {
2737
3052
  return DEFAULT_AZURE_API_VERSION;
@@ -2775,7 +3090,7 @@ function resolveRetryConfig(target) {
2775
3090
  retryableStatusCodes
2776
3091
  };
2777
3092
  }
2778
- function resolveTargetDefinition(definition, env = process.env) {
3093
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
2779
3094
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
2780
3095
  const provider = parsed.provider.toLowerCase();
2781
3096
  const providerBatching = resolveOptionalBoolean(
@@ -2848,7 +3163,7 @@ function resolveTargetDefinition(definition, env = process.env) {
2848
3163
  judgeTarget: parsed.judge_target,
2849
3164
  workers: parsed.workers,
2850
3165
  providerBatching,
2851
- config: resolveCliConfig(parsed, env)
3166
+ config: resolveCliConfig(parsed, env, evalFilePath)
2852
3167
  };
2853
3168
  default:
2854
3169
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
@@ -2966,7 +3281,8 @@ function normalizeCodexLogFormat(value) {
2966
3281
  }
2967
3282
  function resolveMockConfig(target) {
2968
3283
  const response = typeof target.response === "string" ? target.response : void 0;
2969
- return { response };
3284
+ const trace = Array.isArray(target.trace) ? target.trace : void 0;
3285
+ return { response, trace };
2970
3286
  }
2971
3287
  function resolveVSCodeConfig(target, env, insiders) {
2972
3288
  const workspaceTemplateEnvVar = resolveOptionalLiteralString(
@@ -2998,15 +3314,18 @@ function resolveVSCodeConfig(target, env, insiders) {
2998
3314
  workspaceTemplate
2999
3315
  };
3000
3316
  }
3001
- function resolveCliConfig(target, env) {
3317
+ function resolveCliConfig(target, env, evalFilePath) {
3002
3318
  const commandTemplateSource = target.command_template ?? target.commandTemplate;
3003
3319
  const filesFormat = resolveOptionalLiteralString(
3004
3320
  target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3005
3321
  );
3006
- const cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3322
+ let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3007
3323
  allowLiteral: true,
3008
3324
  optionalEnv: true
3009
3325
  });
3326
+ if (!cwd && evalFilePath) {
3327
+ cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
3328
+ }
3010
3329
  const timeoutMs = resolveTimeoutMs(
3011
3330
  target.timeout_seconds ?? target.timeoutSeconds,
3012
3331
  `${target.name} timeout`
@@ -3124,17 +3443,15 @@ function resolveOptionalString(source, env, description, options) {
3124
3443
  if (envVarMatch) {
3125
3444
  const varName = envVarMatch[1];
3126
3445
  const envValue = env[varName];
3127
- if (envValue !== void 0) {
3128
- if (envValue.trim().length === 0) {
3129
- throw new Error(`Environment variable '${varName}' for ${description} is empty`);
3130
- }
3131
- return envValue;
3132
- }
3133
3446
  const optionalEnv = options?.optionalEnv ?? false;
3134
- if (optionalEnv) {
3135
- return void 0;
3447
+ if (envValue === void 0 || envValue.trim().length === 0) {
3448
+ if (optionalEnv) {
3449
+ return void 0;
3450
+ }
3451
+ const status = envValue === void 0 ? "is not set" : "is empty";
3452
+ throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
3136
3453
  }
3137
- throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
3454
+ return envValue;
3138
3455
  }
3139
3456
  const allowLiteral = options?.allowLiteral ?? false;
3140
3457
  if (!allowLiteral) {
@@ -3246,7 +3563,7 @@ function resolveOptionalNumberArray(source, description) {
3246
3563
  }
3247
3564
 
3248
3565
  // src/evaluation/providers/vscode.ts
3249
- var import_node_path11 = __toESM(require("path"), 1);
3566
+ var import_node_path12 = __toESM(require("path"), 1);
3250
3567
  var import_subagent = require("subagent");
3251
3568
 
3252
3569
  // src/evaluation/providers/vscode-templates.ts
@@ -3416,7 +3733,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
3416
3733
  return "";
3417
3734
  }
3418
3735
  const buildList = (files) => files.map((absolutePath) => {
3419
- const fileName = import_node_path11.default.basename(absolutePath);
3736
+ const fileName = import_node_path12.default.basename(absolutePath);
3420
3737
  const fileUri = pathToFileUri2(absolutePath);
3421
3738
  return `* [${fileName}](${fileUri})`;
3422
3739
  });
@@ -3441,8 +3758,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3441
3758
  }
3442
3759
  const unique = /* @__PURE__ */ new Map();
3443
3760
  for (const attachment of attachments) {
3444
- const absolutePath = import_node_path11.default.resolve(attachment);
3445
- const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
3761
+ const absolutePath = import_node_path12.default.resolve(attachment);
3762
+ const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
3446
3763
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3447
3764
  if (!unique.has(absolutePath)) {
3448
3765
  unique.set(absolutePath, absolutePath);
@@ -3457,7 +3774,7 @@ function collectAttachmentFiles(attachments) {
3457
3774
  }
3458
3775
  const unique = /* @__PURE__ */ new Map();
3459
3776
  for (const attachment of attachments) {
3460
- const absolutePath = import_node_path11.default.resolve(attachment);
3777
+ const absolutePath = import_node_path12.default.resolve(attachment);
3461
3778
  if (!unique.has(absolutePath)) {
3462
3779
  unique.set(absolutePath, absolutePath);
3463
3780
  }
@@ -3465,7 +3782,7 @@ function collectAttachmentFiles(attachments) {
3465
3782
  return Array.from(unique.values());
3466
3783
  }
3467
3784
  function pathToFileUri2(filePath) {
3468
- const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
3785
+ const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
3469
3786
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3470
3787
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3471
3788
  return `file:///${normalizedPath}`;
@@ -3478,7 +3795,7 @@ function normalizeAttachments(attachments) {
3478
3795
  }
3479
3796
  const deduped = /* @__PURE__ */ new Set();
3480
3797
  for (const attachment of attachments) {
3481
- deduped.add(import_node_path11.default.resolve(attachment));
3798
+ deduped.add(import_node_path12.default.resolve(attachment));
3482
3799
  }
3483
3800
  return Array.from(deduped);
3484
3801
  }
@@ -3487,7 +3804,7 @@ function mergeAttachments(all) {
3487
3804
  for (const list of all) {
3488
3805
  if (!list) continue;
3489
3806
  for (const inputFile of list) {
3490
- deduped.add(import_node_path11.default.resolve(inputFile));
3807
+ deduped.add(import_node_path12.default.resolve(inputFile));
3491
3808
  }
3492
3809
  }
3493
3810
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3536,7 +3853,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3536
3853
  // src/evaluation/providers/targets-file.ts
3537
3854
  var import_node_fs4 = require("fs");
3538
3855
  var import_promises10 = require("fs/promises");
3539
- var import_node_path12 = __toESM(require("path"), 1);
3856
+ var import_node_path13 = __toESM(require("path"), 1);
3540
3857
  var import_yaml3 = require("yaml");
3541
3858
  function isRecord(value) {
3542
3859
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3573,7 +3890,7 @@ async function fileExists3(filePath) {
3573
3890
  }
3574
3891
  }
3575
3892
  async function readTargetDefinitions(filePath) {
3576
- const absolutePath = import_node_path12.default.resolve(filePath);
3893
+ const absolutePath = import_node_path13.default.resolve(filePath);
3577
3894
  if (!await fileExists3(absolutePath)) {
3578
3895
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3579
3896
  }
@@ -4021,11 +4338,478 @@ function substituteVariables(template, variables) {
4021
4338
  return variables[varName] ?? match;
4022
4339
  });
4023
4340
  }
4341
+ var ToolTrajectoryEvaluator = class {
4342
+ kind = "tool_trajectory";
4343
+ config;
4344
+ constructor(options) {
4345
+ this.config = options.config;
4346
+ }
4347
+ evaluate(context) {
4348
+ const { candidateTrace, candidateTraceSummary } = context;
4349
+ if (!candidateTrace || !candidateTraceSummary) {
4350
+ return {
4351
+ score: 0,
4352
+ verdict: "fail",
4353
+ hits: [],
4354
+ misses: ["No trace available for evaluation"],
4355
+ expectedAspectCount: 1
4356
+ };
4357
+ }
4358
+ switch (this.config.mode) {
4359
+ case "any_order":
4360
+ return this.evaluateAnyOrder(candidateTraceSummary);
4361
+ case "in_order":
4362
+ return this.evaluateInOrder(candidateTrace);
4363
+ case "exact":
4364
+ return this.evaluateExact(candidateTrace);
4365
+ default:
4366
+ return {
4367
+ score: 0,
4368
+ verdict: "fail",
4369
+ hits: [],
4370
+ misses: [`Unknown mode: ${this.config.mode}`],
4371
+ expectedAspectCount: 1
4372
+ };
4373
+ }
4374
+ }
4375
+ evaluateAnyOrder(summary) {
4376
+ const minimums = this.config.minimums ?? {};
4377
+ const toolNames = Object.keys(minimums);
4378
+ if (toolNames.length === 0) {
4379
+ return {
4380
+ score: 1,
4381
+ verdict: "pass",
4382
+ hits: ["No tool requirements specified"],
4383
+ misses: [],
4384
+ expectedAspectCount: 0
4385
+ };
4386
+ }
4387
+ const hits = [];
4388
+ const misses = [];
4389
+ for (const toolName of toolNames) {
4390
+ const required = minimums[toolName];
4391
+ const actual = summary.toolCallsByName[toolName] ?? 0;
4392
+ if (actual >= required) {
4393
+ hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
4394
+ } else {
4395
+ misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
4396
+ }
4397
+ }
4398
+ const score = hits.length / toolNames.length;
4399
+ return {
4400
+ score,
4401
+ verdict: scoreToVerdict(score),
4402
+ hits,
4403
+ misses,
4404
+ expectedAspectCount: toolNames.length
4405
+ };
4406
+ }
4407
+ evaluateInOrder(trace) {
4408
+ const expected = this.config.expected ?? [];
4409
+ if (expected.length === 0) {
4410
+ return {
4411
+ score: 1,
4412
+ verdict: "pass",
4413
+ hits: ["No tool sequence specified"],
4414
+ misses: [],
4415
+ expectedAspectCount: 0
4416
+ };
4417
+ }
4418
+ const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4419
+ const hits = [];
4420
+ const misses = [];
4421
+ let actualIndex = 0;
4422
+ for (let i = 0; i < expected.length; i++) {
4423
+ const expectedTool = expected[i].tool;
4424
+ let found = false;
4425
+ while (actualIndex < actualToolCalls.length) {
4426
+ if (actualToolCalls[actualIndex].name === expectedTool) {
4427
+ hits.push(`Found ${expectedTool} at position ${actualIndex}`);
4428
+ actualIndex++;
4429
+ found = true;
4430
+ break;
4431
+ }
4432
+ actualIndex++;
4433
+ }
4434
+ if (!found) {
4435
+ misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
4436
+ }
4437
+ }
4438
+ const score = hits.length / expected.length;
4439
+ return {
4440
+ score,
4441
+ verdict: scoreToVerdict(score),
4442
+ hits,
4443
+ misses,
4444
+ expectedAspectCount: expected.length
4445
+ };
4446
+ }
4447
+ evaluateExact(trace) {
4448
+ const expected = this.config.expected ?? [];
4449
+ if (expected.length === 0) {
4450
+ return {
4451
+ score: 1,
4452
+ verdict: "pass",
4453
+ hits: ["No tool sequence specified"],
4454
+ misses: [],
4455
+ expectedAspectCount: 0
4456
+ };
4457
+ }
4458
+ const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4459
+ const hits = [];
4460
+ const misses = [];
4461
+ if (actualToolCalls.length !== expected.length) {
4462
+ misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
4463
+ }
4464
+ const checkLength = Math.min(expected.length, actualToolCalls.length);
4465
+ for (let i = 0; i < checkLength; i++) {
4466
+ const expectedTool = expected[i].tool;
4467
+ const actualTool = actualToolCalls[i].name;
4468
+ if (actualTool === expectedTool) {
4469
+ hits.push(`Position ${i}: ${expectedTool} \u2713`);
4470
+ } else {
4471
+ misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
4472
+ }
4473
+ }
4474
+ for (let i = checkLength; i < expected.length; i++) {
4475
+ misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
4476
+ }
4477
+ const score = hits.length / expected.length;
4478
+ return {
4479
+ score,
4480
+ verdict: scoreToVerdict(score),
4481
+ hits,
4482
+ misses,
4483
+ expectedAspectCount: expected.length
4484
+ };
4485
+ }
4486
+ };
4487
+ var ExpectedMessagesEvaluator = class {
4488
+ kind = "expected_messages";
4489
+ evaluate(context) {
4490
+ const { candidateTrace, evalCase } = context;
4491
+ const expectedSegments = evalCase.expected_segments;
4492
+ const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
4493
+ if (expectedToolCalls.length === 0) {
4494
+ return {
4495
+ score: 1,
4496
+ verdict: "pass",
4497
+ hits: ["No tool_calls specified in expected_messages"],
4498
+ misses: [],
4499
+ expectedAspectCount: 1
4500
+ };
4501
+ }
4502
+ if (!candidateTrace || candidateTrace.length === 0) {
4503
+ return {
4504
+ score: 0,
4505
+ verdict: "fail",
4506
+ hits: [],
4507
+ misses: ["No trace available to validate tool_calls"],
4508
+ expectedAspectCount: expectedToolCalls.length
4509
+ };
4510
+ }
4511
+ const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
4512
+ return this.validateToolCalls(expectedToolCalls, actualToolCalls);
4513
+ }
4514
+ extractExpectedToolCalls(segments) {
4515
+ if (!segments) {
4516
+ return [];
4517
+ }
4518
+ const toolCalls = [];
4519
+ for (const segment of segments) {
4520
+ const role = segment.role;
4521
+ const segmentToolCalls = segment.tool_calls;
4522
+ if (role === "assistant" && Array.isArray(segmentToolCalls)) {
4523
+ for (const tc of segmentToolCalls) {
4524
+ if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
4525
+ const toolCall = tc;
4526
+ toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
4527
+ }
4528
+ }
4529
+ }
4530
+ }
4531
+ return toolCalls;
4532
+ }
4533
+ validateToolCalls(expected, actual) {
4534
+ const hits = [];
4535
+ const misses = [];
4536
+ for (let i = 0; i < expected.length; i++) {
4537
+ const expectedCall = expected[i];
4538
+ const actualCall = actual[i];
4539
+ if (!actualCall) {
4540
+ misses.push(
4541
+ `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
4542
+ );
4543
+ continue;
4544
+ }
4545
+ if (actualCall.name !== expectedCall.tool) {
4546
+ misses.push(
4547
+ `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
4548
+ );
4549
+ continue;
4550
+ }
4551
+ if (expectedCall.input !== void 0) {
4552
+ if (!this.deepEquals(expectedCall.input, actualCall.input)) {
4553
+ misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
4554
+ continue;
4555
+ }
4556
+ }
4557
+ hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
4558
+ }
4559
+ const totalChecks = expected.length || 1;
4560
+ const score = hits.length / totalChecks;
4561
+ return {
4562
+ score,
4563
+ verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
4564
+ hits,
4565
+ misses,
4566
+ expectedAspectCount: totalChecks
4567
+ };
4568
+ }
4569
+ deepEquals(a, b) {
4570
+ if (a === b) return true;
4571
+ if (typeof a !== typeof b) return false;
4572
+ if (typeof a !== "object" || a === null || b === null) return false;
4573
+ if (Array.isArray(a) && Array.isArray(b)) {
4574
+ if (a.length !== b.length) return false;
4575
+ return a.every((val, i) => this.deepEquals(val, b[i]));
4576
+ }
4577
+ if (Array.isArray(a) || Array.isArray(b)) return false;
4578
+ const aObj = a;
4579
+ const bObj = b;
4580
+ const aKeys = Object.keys(aObj);
4581
+ const bKeys = Object.keys(bObj);
4582
+ if (aKeys.length !== bKeys.length) return false;
4583
+ return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
4584
+ }
4585
+ };
4586
+ var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
4587
+ {{EVALUATOR_RESULTS_JSON}}
4588
+
4589
+ Decide the final score and verdict based on all evaluator results.
4590
+ Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
4591
+ var CompositeEvaluator = class {
4592
+ kind = "composite";
4593
+ config;
4594
+ evaluatorFactory;
4595
+ cwd;
4596
+ constructor(options) {
4597
+ this.config = options.config;
4598
+ this.evaluatorFactory = options.evaluatorFactory;
4599
+ this.cwd = options.cwd;
4600
+ }
4601
+ async evaluate(context) {
4602
+ const memberResults = await Promise.all(
4603
+ this.config.evaluators.map(async (memberConfig) => {
4604
+ const evaluator = this.evaluatorFactory.create(memberConfig, context);
4605
+ return {
4606
+ id: memberConfig.name,
4607
+ type: memberConfig.type,
4608
+ result: await evaluator.evaluate(context)
4609
+ };
4610
+ })
4611
+ );
4612
+ return this.aggregate(memberResults, context);
4613
+ }
4614
+ async aggregate(results, context) {
4615
+ const aggregator = this.config.aggregator;
4616
+ switch (aggregator.type) {
4617
+ case "code_judge":
4618
+ return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
4619
+ case "llm_judge":
4620
+ return this.runLlmAggregator(results, context, aggregator);
4621
+ default:
4622
+ return this.runWeightedAverage(results, aggregator.weights);
4623
+ }
4624
+ }
4625
+ runWeightedAverage(results, weights) {
4626
+ let totalWeight = 0;
4627
+ let weightedSum = 0;
4628
+ const allHits = [];
4629
+ const allMisses = [];
4630
+ const reasoningParts = [];
4631
+ const evaluatorResults = [];
4632
+ for (const member of results) {
4633
+ const weight = weights?.[member.id] ?? 1;
4634
+ totalWeight += weight;
4635
+ weightedSum += member.result.score * weight;
4636
+ allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
4637
+ allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
4638
+ if (member.result.reasoning) {
4639
+ reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
4640
+ }
4641
+ evaluatorResults.push({
4642
+ name: member.id,
4643
+ type: member.type,
4644
+ score: member.result.score,
4645
+ weight,
4646
+ verdict: member.result.verdict,
4647
+ hits: [...member.result.hits],
4648
+ misses: [...member.result.misses],
4649
+ reasoning: member.result.reasoning,
4650
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
4651
+ evaluatorResults: member.result.evaluatorResults
4652
+ });
4653
+ }
4654
+ const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
4655
+ return {
4656
+ score: clampScore(finalScore),
4657
+ verdict: scoreToVerdict(finalScore),
4658
+ hits: allHits,
4659
+ misses: allMisses,
4660
+ expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
4661
+ reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
4662
+ evaluatorRawRequest: {
4663
+ aggregator: "weighted_average",
4664
+ ...weights ? { weights } : {}
4665
+ },
4666
+ evaluatorResults
4667
+ };
4668
+ }
4669
+ async runCodeAggregator(results, scriptPath, cwd, weights) {
4670
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
4671
+ const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
4672
+ const evaluatorResults = results.map((member) => ({
4673
+ name: member.id,
4674
+ type: member.type,
4675
+ score: member.result.score,
4676
+ weight: weights?.[member.id] ?? 1,
4677
+ verdict: member.result.verdict,
4678
+ hits: [...member.result.hits],
4679
+ misses: [...member.result.misses],
4680
+ reasoning: member.result.reasoning,
4681
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
4682
+ evaluatorResults: member.result.evaluatorResults
4683
+ }));
4684
+ try {
4685
+ const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
4686
+ const parsed = parseJsonSafe(stdout);
4687
+ const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
4688
+ const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
4689
+ const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
4690
+ const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
4691
+ const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
4692
+ return {
4693
+ score,
4694
+ verdict,
4695
+ hits,
4696
+ misses,
4697
+ expectedAspectCount: hits.length + misses.length || 1,
4698
+ reasoning,
4699
+ evaluatorRawRequest: {
4700
+ aggregator: "code_judge",
4701
+ script: scriptPath
4702
+ },
4703
+ evaluatorResults
4704
+ };
4705
+ } catch (error) {
4706
+ const message = error instanceof Error ? error.message : String(error);
4707
+ return {
4708
+ score: 0,
4709
+ verdict: "fail",
4710
+ hits: [],
4711
+ misses: [`Code aggregator failed: ${message}`],
4712
+ expectedAspectCount: 1,
4713
+ reasoning: message,
4714
+ evaluatorRawRequest: {
4715
+ aggregator: "code_judge",
4716
+ script: scriptPath,
4717
+ error: message
4718
+ },
4719
+ evaluatorResults
4720
+ };
4721
+ }
4722
+ }
4723
+ async runLlmAggregator(results, context, config) {
4724
+ const judgeProvider = context.judgeProvider;
4725
+ if (!judgeProvider) {
4726
+ throw new Error("No judge provider available for LLM aggregation");
4727
+ }
4728
+ const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
4729
+ const resultsJson = JSON.stringify(resultsObject, null, 2);
4730
+ const evaluatorResults = results.map((member) => ({
4731
+ name: member.id,
4732
+ type: member.type,
4733
+ score: member.result.score,
4734
+ verdict: member.result.verdict,
4735
+ hits: [...member.result.hits],
4736
+ misses: [...member.result.misses],
4737
+ reasoning: member.result.reasoning,
4738
+ evaluatorRawRequest: member.result.evaluatorRawRequest,
4739
+ evaluatorResults: member.result.evaluatorResults
4740
+ }));
4741
+ const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
4742
+ const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
4743
+ const systemPrompt = buildOutputSchema();
4744
+ const evaluatorRawRequest = {
4745
+ aggregator: "llm_judge",
4746
+ userPrompt,
4747
+ systemPrompt,
4748
+ target: judgeProvider.targetName
4749
+ };
4750
+ try {
4751
+ const model = judgeProvider.asLanguageModel?.();
4752
+ if (model) {
4753
+ const { text } = await (0, import_ai2.generateText)({
4754
+ model,
4755
+ system: systemPrompt,
4756
+ prompt: userPrompt
4757
+ });
4758
+ const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
4759
+ const score2 = clampScore(data2.score);
4760
+ const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
4761
+ const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
4762
+ const reasoning2 = data2.reasoning;
4763
+ return {
4764
+ score: score2,
4765
+ verdict: scoreToVerdict(score2),
4766
+ hits: hits2,
4767
+ misses: misses2,
4768
+ expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
4769
+ reasoning: reasoning2,
4770
+ evaluatorRawRequest,
4771
+ evaluatorResults
4772
+ };
4773
+ }
4774
+ const response = await judgeProvider.invoke({
4775
+ question: userPrompt,
4776
+ systemPrompt,
4777
+ evalCaseId: context.evalCase.id,
4778
+ attempt: context.attempt
4779
+ });
4780
+ const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
4781
+ const score = clampScore(data.score);
4782
+ const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4783
+ const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4784
+ const reasoning = data.reasoning ?? response.reasoning;
4785
+ return {
4786
+ score,
4787
+ verdict: scoreToVerdict(score),
4788
+ hits,
4789
+ misses,
4790
+ expectedAspectCount: Math.max(hits.length + misses.length, 1),
4791
+ reasoning,
4792
+ evaluatorRawRequest,
4793
+ evaluatorResults
4794
+ };
4795
+ } catch {
4796
+ return {
4797
+ score: 0,
4798
+ verdict: "fail",
4799
+ hits: [],
4800
+ misses: [],
4801
+ expectedAspectCount: 1,
4802
+ evaluatorRawRequest,
4803
+ evaluatorResults
4804
+ };
4805
+ }
4806
+ }
4807
+ };
4024
4808
 
4025
4809
  // src/evaluation/orchestrator.ts
4026
4810
  var import_node_crypto2 = require("crypto");
4027
4811
  var import_promises11 = require("fs/promises");
4028
- var import_node_path13 = __toESM(require("path"), 1);
4812
+ var import_node_path14 = __toESM(require("path"), 1);
4029
4813
 
4030
4814
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
4031
4815
  var Node = class {
@@ -4232,7 +5016,7 @@ async function runEvaluation(options) {
4232
5016
  if (!definition) {
4233
5017
  return void 0;
4234
5018
  }
4235
- const resolved = resolveTargetDefinition(definition, envLookup);
5019
+ const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
4236
5020
  resolvedTargetsByName.set(name, resolved);
4237
5021
  return resolved;
4238
5022
  };
@@ -4546,6 +5330,17 @@ async function runEvalCase(options) {
4546
5330
  if (cacheKey && cache && !cachedResponse) {
4547
5331
  await cache.set(cacheKey, providerResponse);
4548
5332
  }
5333
+ let candidateTrace = providerResponse.trace;
5334
+ if (!candidateTrace && providerResponse.traceRef) {
5335
+ try {
5336
+ const rawTrace = await readJsonFile(providerResponse.traceRef);
5337
+ if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
5338
+ candidateTrace = rawTrace;
5339
+ }
5340
+ } catch {
5341
+ }
5342
+ }
5343
+ const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
4549
5344
  try {
4550
5345
  return await evaluateCandidate({
4551
5346
  evalCase,
@@ -4557,7 +5352,9 @@ async function runEvalCase(options) {
4557
5352
  nowFn,
4558
5353
  attempt,
4559
5354
  judgeProvider,
4560
- agentTimeoutMs
5355
+ agentTimeoutMs,
5356
+ candidateTrace,
5357
+ candidateTraceSummary
4561
5358
  });
4562
5359
  } catch (error) {
4563
5360
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4574,7 +5371,9 @@ async function evaluateCandidate(options) {
4574
5371
  nowFn,
4575
5372
  attempt,
4576
5373
  judgeProvider,
4577
- agentTimeoutMs
5374
+ agentTimeoutMs,
5375
+ candidateTrace,
5376
+ candidateTraceSummary
4578
5377
  } = options;
4579
5378
  const gradeTimestamp = nowFn();
4580
5379
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4587,7 +5386,9 @@ async function evaluateCandidate(options) {
4587
5386
  promptInputs,
4588
5387
  now: gradeTimestamp,
4589
5388
  judgeProvider,
4590
- agentTimeoutMs
5389
+ agentTimeoutMs,
5390
+ candidateTrace,
5391
+ candidateTraceSummary
4591
5392
  });
4592
5393
  const completedAt = nowFn();
4593
5394
  let agentProviderRequest;
@@ -4626,7 +5427,8 @@ async function evaluateCandidate(options) {
4626
5427
  agent_provider_request: agentProviderRequest,
4627
5428
  lm_provider_request: lmProviderRequest,
4628
5429
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4629
- evaluator_results: evaluatorResults
5430
+ evaluator_results: evaluatorResults,
5431
+ trace_summary: candidateTraceSummary
4630
5432
  };
4631
5433
  }
4632
5434
  async function runEvaluatorsForCase(options) {
@@ -4640,7 +5442,9 @@ async function runEvaluatorsForCase(options) {
4640
5442
  promptInputs,
4641
5443
  now,
4642
5444
  judgeProvider,
4643
- agentTimeoutMs
5445
+ agentTimeoutMs,
5446
+ candidateTrace,
5447
+ candidateTraceSummary
4644
5448
  } = options;
4645
5449
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4646
5450
  return runEvaluatorList({
@@ -4654,7 +5458,9 @@ async function runEvaluatorsForCase(options) {
4654
5458
  promptInputs,
4655
5459
  now,
4656
5460
  judgeProvider,
4657
- agentTimeoutMs
5461
+ agentTimeoutMs,
5462
+ candidateTrace,
5463
+ candidateTraceSummary
4658
5464
  });
4659
5465
  }
4660
5466
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4670,7 +5476,9 @@ async function runEvaluatorsForCase(options) {
4670
5476
  attempt,
4671
5477
  promptInputs,
4672
5478
  now,
4673
- judgeProvider
5479
+ judgeProvider,
5480
+ candidateTrace,
5481
+ candidateTraceSummary
4674
5482
  });
4675
5483
  return { score };
4676
5484
  }
@@ -4686,7 +5494,9 @@ async function runEvaluatorList(options) {
4686
5494
  promptInputs,
4687
5495
  now,
4688
5496
  judgeProvider,
4689
- agentTimeoutMs
5497
+ agentTimeoutMs,
5498
+ candidateTrace,
5499
+ candidateTraceSummary
4690
5500
  } = options;
4691
5501
  const scored = [];
4692
5502
  const evaluatorResults = [];
@@ -4732,6 +5542,63 @@ async function runEvaluatorList(options) {
4732
5542
  promptInputs,
4733
5543
  now
4734
5544
  });
5545
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
5546
+ evaluatorResults.push({
5547
+ name: evaluator.name,
5548
+ type: "code_judge",
5549
+ score: score2.score,
5550
+ verdict: score2.verdict,
5551
+ hits: score2.hits,
5552
+ misses: score2.misses,
5553
+ reasoning: score2.reasoning,
5554
+ evaluator_provider_request: score2.evaluatorRawRequest
5555
+ });
5556
+ }
5557
+ if (evaluator.type === "composite") {
5558
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5559
+ const createEvaluator = (memberConfig) => {
5560
+ switch (memberConfig.type) {
5561
+ case "llm_judge":
5562
+ return evaluatorRegistry.llm_judge;
5563
+ case "code":
5564
+ return new CodeEvaluator({
5565
+ script: memberConfig.script,
5566
+ cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
5567
+ agentTimeoutMs
5568
+ });
5569
+ case "composite":
5570
+ return new CompositeEvaluator({
5571
+ config: memberConfig,
5572
+ cwd: evalFileDir,
5573
+ evaluatorFactory: { create: createEvaluator }
5574
+ });
5575
+ case "tool_trajectory":
5576
+ return new ToolTrajectoryEvaluator({
5577
+ config: memberConfig
5578
+ });
5579
+ case "expected_messages":
5580
+ return new ExpectedMessagesEvaluator();
5581
+ default: {
5582
+ const unknownConfig = memberConfig;
5583
+ throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
5584
+ }
5585
+ }
5586
+ };
5587
+ const compositeEvaluator = new CompositeEvaluator({
5588
+ config: evaluator,
5589
+ cwd: evalFileDir,
5590
+ evaluatorFactory: { create: createEvaluator }
5591
+ });
5592
+ const score2 = await compositeEvaluator.evaluate({
5593
+ evalCase,
5594
+ candidate,
5595
+ target,
5596
+ provider,
5597
+ attempt,
5598
+ promptInputs,
5599
+ now,
5600
+ judgeProvider
5601
+ });
4735
5602
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4736
5603
  evaluatorResults.push({
4737
5604
  name: evaluator.name,
@@ -4741,7 +5608,58 @@ async function runEvaluatorList(options) {
4741
5608
  hits: score2.hits,
4742
5609
  misses: score2.misses,
4743
5610
  reasoning: score2.reasoning,
4744
- evaluator_provider_request: score2.evaluatorRawRequest
5611
+ evaluator_provider_request: score2.evaluatorRawRequest,
5612
+ evaluator_results: mapChildResults(score2.evaluatorResults)
5613
+ });
5614
+ }
5615
+ if (evaluator.type === "tool_trajectory") {
5616
+ const trajectoryEvaluator = new ToolTrajectoryEvaluator({
5617
+ config: evaluator
5618
+ });
5619
+ const score2 = trajectoryEvaluator.evaluate({
5620
+ evalCase,
5621
+ candidate,
5622
+ target,
5623
+ provider,
5624
+ attempt,
5625
+ promptInputs,
5626
+ now,
5627
+ candidateTrace,
5628
+ candidateTraceSummary
5629
+ });
5630
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
5631
+ evaluatorResults.push({
5632
+ name: evaluator.name,
5633
+ type: evaluator.type,
5634
+ score: score2.score,
5635
+ verdict: score2.verdict,
5636
+ hits: score2.hits,
5637
+ misses: score2.misses,
5638
+ reasoning: score2.reasoning
5639
+ });
5640
+ }
5641
+ if (evaluator.type === "expected_messages") {
5642
+ const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
5643
+ const score2 = expectedMessagesEvaluator.evaluate({
5644
+ evalCase,
5645
+ candidate,
5646
+ target,
5647
+ provider,
5648
+ attempt,
5649
+ promptInputs,
5650
+ now,
5651
+ candidateTrace,
5652
+ candidateTraceSummary
5653
+ });
5654
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
5655
+ evaluatorResults.push({
5656
+ name: evaluator.name,
5657
+ type: evaluator.type,
5658
+ score: score2.score,
5659
+ verdict: score2.verdict,
5660
+ hits: score2.hits,
5661
+ misses: score2.misses,
5662
+ reasoning: score2.reasoning
4745
5663
  });
4746
5664
  }
4747
5665
  } catch (error) {
@@ -4754,14 +5672,15 @@ async function runEvaluatorList(options) {
4754
5672
  expectedAspectCount: 1,
4755
5673
  reasoning: message
4756
5674
  };
5675
+ const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
4757
5676
  scored.push({
4758
5677
  score: fallbackScore,
4759
5678
  name: evaluator.name ?? "unknown",
4760
- type: evaluator.type ?? "unknown"
5679
+ type: resultType ?? "llm_judge"
4761
5680
  });
4762
5681
  evaluatorResults.push({
4763
5682
  name: evaluator.name ?? "unknown",
4764
- type: evaluator.type ?? "unknown",
5683
+ type: resultType ?? "llm_judge",
4765
5684
  score: 0,
4766
5685
  verdict: "fail",
4767
5686
  hits: [],
@@ -4865,8 +5784,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
4865
5784
  async function dumpPrompt(directory, evalCase, promptInputs) {
4866
5785
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
4867
5786
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
4868
- const filePath = import_node_path13.default.resolve(directory, filename);
4869
- await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
5787
+ const filePath = import_node_path14.default.resolve(directory, filename);
5788
+ await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
4870
5789
  const payload = {
4871
5790
  eval_id: evalCase.id,
4872
5791
  question: promptInputs.question,
@@ -4979,6 +5898,23 @@ function isTimeoutLike(error) {
4979
5898
  const value = String(error).toLowerCase();
4980
5899
  return value.includes("timeout");
4981
5900
  }
5901
+ function mapChildResults(children) {
5902
+ if (!children || children.length === 0) {
5903
+ return void 0;
5904
+ }
5905
+ return children.map((child) => ({
5906
+ name: child.name,
5907
+ type: child.type,
5908
+ score: child.score,
5909
+ weight: child.weight,
5910
+ verdict: child.verdict,
5911
+ hits: child.hits,
5912
+ misses: child.misses,
5913
+ reasoning: child.reasoning,
5914
+ evaluator_provider_request: child.evaluatorRawRequest,
5915
+ evaluator_results: mapChildResults(child.evaluatorResults)
5916
+ }));
5917
+ }
4982
5918
 
4983
5919
  // src/evaluation/generators/rubric-generator.ts
4984
5920
  var import_ai3 = require("ai");
@@ -5067,11 +6003,15 @@ function createAgentKernel() {
5067
6003
  // Annotate the CommonJS export names for ESM import in node:
5068
6004
  0 && (module.exports = {
5069
6005
  CodeEvaluator,
6006
+ CompositeEvaluator,
6007
+ ExpectedMessagesEvaluator,
5070
6008
  LlmJudgeEvaluator,
5071
6009
  TEST_MESSAGE_ROLES,
6010
+ ToolTrajectoryEvaluator,
5072
6011
  buildDirectoryChain,
5073
6012
  buildPromptInputs,
5074
6013
  buildSearchRoots,
6014
+ computeTraceSummary,
5075
6015
  consumeCodexLogEntries,
5076
6016
  createAgentKernel,
5077
6017
  createProvider,
@@ -5082,14 +6022,18 @@ function createAgentKernel() {
5082
6022
  generateRubrics,
5083
6023
  getHitCount,
5084
6024
  isEvaluatorKind,
6025
+ isExpectedToolCall,
5085
6026
  isGuidelineFile,
5086
6027
  isJsonObject,
5087
6028
  isJsonValue,
5088
6029
  isTestMessage,
5089
6030
  isTestMessageRole,
6031
+ isTraceEvent,
6032
+ isTraceEventType,
5090
6033
  listTargetNames,
5091
6034
  loadEvalCases,
5092
6035
  normalizeLineEndings,
6036
+ readJsonFile,
5093
6037
  readTargetDefinitions,
5094
6038
  readTestSuiteMetadata,
5095
6039
  readTextFile,