@agentv/core 0.23.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,11 +32,14 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
+ ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
35
36
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
37
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
38
+ ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
37
39
  buildDirectoryChain: () => buildDirectoryChain2,
38
40
  buildPromptInputs: () => buildPromptInputs,
39
41
  buildSearchRoots: () => buildSearchRoots2,
42
+ computeTraceSummary: () => computeTraceSummary,
40
43
  consumeCodexLogEntries: () => consumeCodexLogEntries,
41
44
  createAgentKernel: () => createAgentKernel,
42
45
  createProvider: () => createProvider,
@@ -47,14 +50,18 @@ __export(index_exports, {
47
50
  generateRubrics: () => generateRubrics,
48
51
  getHitCount: () => getHitCount,
49
52
  isEvaluatorKind: () => isEvaluatorKind,
53
+ isExpectedToolCall: () => isExpectedToolCall,
50
54
  isGuidelineFile: () => isGuidelineFile,
51
55
  isJsonObject: () => isJsonObject,
52
56
  isJsonValue: () => isJsonValue,
53
57
  isTestMessage: () => isTestMessage,
54
58
  isTestMessageRole: () => isTestMessageRole,
59
+ isTraceEvent: () => isTraceEvent,
60
+ isTraceEventType: () => isTraceEventType,
55
61
  listTargetNames: () => listTargetNames,
56
62
  loadEvalCases: () => loadEvalCases,
57
63
  normalizeLineEndings: () => normalizeLineEndings,
64
+ readJsonFile: () => readJsonFile,
58
65
  readTargetDefinitions: () => readTargetDefinitions,
59
66
  readTestSuiteMetadata: () => readTestSuiteMetadata,
60
67
  readTextFile: () => readTextFile,
@@ -108,7 +115,14 @@ function isTestMessage(value) {
108
115
  }
109
116
  return candidate.content.every(isJsonObject);
110
117
  }
111
- var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
118
+ var EVALUATOR_KIND_VALUES = [
119
+ "code_judge",
120
+ "llm_judge",
121
+ "rubric",
122
+ "composite",
123
+ "tool_trajectory",
124
+ "expected_messages"
125
+ ];
112
126
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
113
127
  function isEvaluatorKind(value) {
114
128
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -117,6 +131,44 @@ function getHitCount(result) {
117
131
  return result.hits.length;
118
132
  }
119
133
 
134
+ // src/evaluation/trace.ts
135
// Built once at module load so the guard does not allocate a fresh array per call.
var TRACE_EVENT_TYPE_SET = /* @__PURE__ */ new Set([
  "model_step",
  "tool_call",
  "tool_result",
  "message",
  "error"
]);

/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True only when `value` is a string equal to one of
 *   "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  return typeof value === "string" && TRACE_EVENT_TYPE_SET.has(value);
}
138
/**
 * Type guard for a single trace event.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object whose `type`
 *   passes isTraceEventType and whose `timestamp` is a string. No other
 *   fields are inspected.
 */
function isTraceEvent(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!isTraceEventType(value.type)) {
    return false;
  }
  return typeof value.timestamp === "string";
}
145
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object with a string
 *   `tool` property; any other properties are ignored.
 */
function isExpectedToolCall(value) {
  const isObjectLike = value !== null && typeof value === "object";
  return isObjectLike ? typeof value.tool === "string" : false;
}
152
/**
 * Summarise a trace: total event count, per-tool call counts and error count.
 *
 * Only events with `type === "tool_call"` and a truthy `name` are counted as
 * tool calls; events with `type === "error"` increment `errorCount`.
 *
 * @param {Array<{type: string, name?: string}>} trace - Trace events to summarise.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
 *   `toolNames` is the sorted list of distinct tool names; `toolCallsByName`
 *   maps each of those names to its call count.
 */
function computeTraceSummary(trace) {
  // Counting in a Map (instead of a plain object) keeps tool names such as
  // "constructor" or "toString" from reading inherited Object.prototype
  // members, which previously turned the count into a garbage string.
  const counts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      // Keys are stringified to match plain-object key semantics.
      const key = String(event.name);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  // Object.fromEntries defines own data properties, so every counted name
  // ends up as an own key of the returned plain object.
  const toolCallsByName = Object.fromEntries(counts);
  const toolNames = [...counts.keys()].sort();
  return {
    eventCount: trace.length,
    toolNames,
    toolCallsByName,
    errorCount
  };
}
171
+
120
172
  // src/evaluation/yaml-parser.ts
121
173
  var import_promises6 = require("fs/promises");
122
174
  var import_node_path6 = __toESM(require("path"), 1);
@@ -466,6 +518,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
466
518
  logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
467
519
  continue;
468
520
  }
521
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
469
522
  const cwd = asString2(rawEvaluator.cwd);
470
523
  let resolvedCwd;
471
524
  if (cwd) {
@@ -486,7 +539,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
486
539
  type: "code",
487
540
  script,
488
541
  cwd,
489
- resolvedCwd
542
+ resolvedCwd,
543
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
490
544
  });
491
545
  continue;
492
546
  }
@@ -581,14 +635,89 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
581
635
  ...promptPath2 ? { promptPath: promptPath2 } : {}
582
636
  };
583
637
  }
638
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
584
639
  evaluators.push({
585
640
  name,
586
641
  type: "composite",
587
642
  evaluators: memberEvaluators,
588
- aggregator
643
+ aggregator,
644
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
589
645
  });
590
646
  continue;
591
647
  }
648
+ if (typeValue === "expected_messages") {
649
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
650
+ evaluators.push({
651
+ name,
652
+ type: "expected_messages",
653
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
654
+ });
655
+ continue;
656
+ }
657
+ if (typeValue === "tool_trajectory") {
658
+ const mode = asString2(rawEvaluator.mode);
659
+ if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
660
+ logWarning2(
661
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
662
+ );
663
+ continue;
664
+ }
665
+ const rawMinimums = rawEvaluator.minimums;
666
+ let minimums;
667
+ if (rawMinimums !== void 0) {
668
+ if (!isJsonObject2(rawMinimums)) {
669
+ logWarning2(
670
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
671
+ );
672
+ continue;
673
+ }
674
+ minimums = {};
675
+ for (const [toolName, count] of Object.entries(rawMinimums)) {
676
+ if (typeof count === "number" && count >= 0) {
677
+ minimums[toolName] = count;
678
+ }
679
+ }
680
+ }
681
+ const rawExpected = rawEvaluator.expected;
682
+ let expected;
683
+ if (rawExpected !== void 0) {
684
+ if (!Array.isArray(rawExpected)) {
685
+ logWarning2(
686
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
687
+ );
688
+ continue;
689
+ }
690
+ expected = [];
691
+ for (const item of rawExpected) {
692
+ if (isJsonObject2(item) && typeof item.tool === "string") {
693
+ expected.push({ tool: item.tool });
694
+ }
695
+ }
696
+ }
697
+ if (mode === "any_order" && !minimums) {
698
+ logWarning2(
699
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
700
+ );
701
+ continue;
702
+ }
703
+ if ((mode === "in_order" || mode === "exact") && !expected) {
704
+ logWarning2(
705
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
706
+ );
707
+ continue;
708
+ }
709
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
710
+ const config = {
711
+ name,
712
+ type: "tool_trajectory",
713
+ mode,
714
+ ...minimums ? { minimums } : {},
715
+ ...expected ? { expected } : {},
716
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
717
+ };
718
+ evaluators.push(config);
719
+ continue;
720
+ }
592
721
  const prompt = asString2(rawEvaluator.prompt);
593
722
  let promptPath;
594
723
  if (prompt) {
@@ -625,19 +754,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
625
754
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
626
755
  continue;
627
756
  }
757
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
628
758
  evaluators.push({
629
759
  name,
630
760
  type: "llm_judge",
631
- rubrics: parsedRubrics
761
+ rubrics: parsedRubrics,
762
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
632
763
  });
633
764
  continue;
634
765
  }
766
+ const weight = validateWeight(rawEvaluator.weight, name, evalId);
635
767
  evaluators.push({
636
768
  name,
637
769
  type: "llm_judge",
638
770
  prompt,
639
771
  promptPath,
640
- ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
772
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
773
+ ...weight !== void 0 ? { weight } : {}
641
774
  });
642
775
  }
643
776
  return evaluators.length > 0 ? evaluators : void 0;
@@ -667,6 +800,27 @@ ${detailBlock}${ANSI_RESET3}`);
667
800
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
668
801
  }
669
802
  }
803
/**
 * Validate an evaluator's optional `weight` setting.
 *
 * @param {unknown} rawWeight - Raw value taken from the evaluator definition.
 * @param {string} evaluatorName - Evaluator name, used in error messages.
 * @param {string} evalId - Eval case id, used in error messages.
 * @returns {number|undefined} `undefined` when no weight was supplied,
 *   otherwise the weight unchanged.
 * @throws {Error} When the weight is not a number, is not finite, or is negative.
 */
function validateWeight(rawWeight, evaluatorName, evalId) {
  // Built lazily so nothing is formatted for valid weights.
  const invalid = (reason) => new Error(`Invalid weight for evaluator '${evaluatorName}' in '${evalId}': ${reason}`);
  switch (true) {
    case rawWeight === void 0:
      return void 0;
    case typeof rawWeight !== "number":
      throw invalid("must be a number");
    case !Number.isFinite(rawWeight):
      throw invalid(`must be finite (got ${rawWeight})`);
    case rawWeight < 0:
      throw invalid(`must be non-negative (got ${rawWeight})`);
    default:
      return rawWeight;
  }
}
670
824
 
671
825
  // src/evaluation/loaders/message-processor.ts
672
826
  var import_promises4 = require("fs/promises");
@@ -842,6 +996,67 @@ ${detailBlock}${ANSI_RESET4}`);
842
996
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
843
997
  }
844
998
  }
999
/**
 * Convert raw `expected_messages` entries into output segments.
 *
 * For every message the role is copied, and `tool_calls` is copied only for
 * assistant messages that define it. String content is passed through as-is.
 * Array content is processed entry by entry:
 *   - entries that are not JSON objects are dropped;
 *   - entries of type "file" are resolved with resolveFileReference, read as
 *     UTF-8 with CRLF normalised to LF, and emitted as
 *     `{ type: "file", path, text, resolvedPath }`;
 *   - every other object entry is cloned with cloneJsonObject.
 * Missing or unreadable files are reported through logWarning3 and skipped.
 *
 * @param {object} options
 * @param {Array<object>} options.messages - Raw expected messages.
 * @param {Array<string>} options.searchRoots - Passed to resolveFileReference.
 * @param {boolean} [options.verbose] - Log each resolved file to the console.
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 *
 * Note: callers also pass `options.repoRootPath`; it is not used here.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const content = message.content;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        const segmentType = asString3(rawSegment.type);
        if (segmentType !== "file") {
          processedContent.push(cloneJsonObject(rawSegment));
          continue;
        }
        const rawValue = asString3(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
          logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
          continue;
        }
        try {
          const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          processedContent.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: import_node_path4.default.resolve(resolvedPath)
          });
          if (verbose) {
            console.log(` [Expected Output File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          // A thrown value is not guaranteed to be an Error; avoid logging "undefined".
          const reason = error instanceof Error ? error.message : String(error);
          logWarning3(
            `Could not read expected output file ${resolvedPath}: ${reason}`
          );
        }
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
845
1060
 
846
1061
  // src/evaluation/formatting/prompt-builder.ts
847
1062
  var import_promises5 = require("fs/promises");
@@ -1146,12 +1361,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1146
1361
  messageType: "input",
1147
1362
  verbose
1148
1363
  });
1149
- const outputSegments = hasExpectedMessages ? await processMessages({
1364
+ const outputSegments = hasExpectedMessages ? await processExpectedMessages({
1150
1365
  messages: expectedMessages,
1151
1366
  searchRoots,
1152
1367
  repoRootPath,
1153
- guidelinePatterns,
1154
- messageType: "output",
1155
1368
  verbose
1156
1369
  }) : [];
1157
1370
  const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1278,6 +1491,10 @@ async function readTextFile(filePath) {
1278
1491
  const content = await (0, import_promises7.readFile)(filePath, "utf8");
1279
1492
  return normalizeLineEndings(content);
1280
1493
  }
1494
/**
 * Read a UTF-8 file and parse it as JSON.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<unknown>} The parsed JSON value.
 * @throws Rejects with the read error when the file cannot be read, or with
 *   the SyntaxError from JSON.parse when the content is not valid JSON.
 */
async function readJsonFile(filePath) {
  const content = await (0, import_promises7.readFile)(filePath, "utf8");
  // Decoding as "utf8" keeps a leading byte-order mark, and JSON.parse rejects
  // U+FEFF, so files saved with a BOM would otherwise fail to load.
  const text = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;
  return JSON.parse(text);
}
1281
1498
  async function findGitRoot(startPath) {
1282
1499
  let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
1283
1500
  const root = import_node_path7.default.parse(currentDir).root;
@@ -1786,9 +2003,11 @@ var CliProvider = class {
1786
2003
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1787
2004
  throw new Error(message);
1788
2005
  }
1789
- const responseText = await this.readAndCleanupOutputFile(outputFilePath);
2006
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
2007
+ const parsed = this.parseOutputContent(responseContent);
1790
2008
  return {
1791
- text: responseText,
2009
+ text: parsed.text,
2010
+ trace: parsed.trace,
1792
2011
  raw: {
1793
2012
  command: renderedCommand,
1794
2013
  stderr: result.stderr,
@@ -1798,6 +2017,31 @@ var CliProvider = class {
1798
2017
  }
1799
2018
  };
1800
2019
  }
2020
+ /**
2021
+ * Parse output content from CLI.
2022
+ * If the content is valid JSON with a 'text' field, extract text and optional trace.
2023
+ * Otherwise, treat the entire content as plain text.
2024
+ */
2025
+ parseOutputContent(content) {
2026
+ try {
2027
+ const parsed = JSON.parse(content);
2028
+ if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
2029
+ const obj = parsed;
2030
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2031
+ const trace = this.parseTrace(obj.trace);
2032
+ return { text, trace };
2033
+ }
2034
+ } catch {
2035
+ }
2036
+ return { text: content };
2037
+ }
2038
+ parseTrace(trace) {
2039
+ if (!Array.isArray(trace)) {
2040
+ return void 0;
2041
+ }
2042
+ const validEvents = trace.filter(isTraceEvent);
2043
+ return validEvents.length > 0 ? validEvents : void 0;
2044
+ }
1801
2045
  async readAndCleanupOutputFile(filePath) {
1802
2046
  try {
1803
2047
  const content = await readTextFile(filePath);
@@ -2784,6 +3028,7 @@ var MockProvider = class {
2784
3028
  delayMs;
2785
3029
  delayMinMs;
2786
3030
  delayMaxMs;
3031
+ trace;
2787
3032
  constructor(targetName, config) {
2788
3033
  this.id = `mock:${targetName}`;
2789
3034
  this.targetName = targetName;
@@ -2791,6 +3036,7 @@ var MockProvider = class {
2791
3036
  this.delayMs = config.delayMs ?? 0;
2792
3037
  this.delayMinMs = config.delayMinMs ?? 0;
2793
3038
  this.delayMaxMs = config.delayMaxMs ?? 0;
3039
+ this.trace = config.trace;
2794
3040
  }
2795
3041
  async invoke(request) {
2796
3042
  const delay = this.calculateDelay();
@@ -2802,7 +3048,8 @@ var MockProvider = class {
2802
3048
  raw: {
2803
3049
  question: request.question,
2804
3050
  guidelines: request.guidelines
2805
- }
3051
+ },
3052
+ trace: this.trace
2806
3053
  };
2807
3054
  }
2808
3055
  calculateDelay() {
@@ -2816,6 +3063,7 @@ var MockProvider = class {
2816
3063
  };
2817
3064
 
2818
3065
  // src/evaluation/providers/targets.ts
3066
+ var import_node_path11 = __toESM(require("path"), 1);
2819
3067
  var import_zod = require("zod");
2820
3068
  var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
2821
3069
  "PROMPT",
@@ -2831,7 +3079,7 @@ var BASE_TARGET_SCHEMA = import_zod.z.object({
2831
3079
  judge_target: import_zod.z.string().optional(),
2832
3080
  workers: import_zod.z.number().int().min(1).optional()
2833
3081
  }).passthrough();
2834
- var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
3082
+ var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
2835
3083
  function normalizeAzureApiVersion(value) {
2836
3084
  if (!value) {
2837
3085
  return DEFAULT_AZURE_API_VERSION;
@@ -2875,7 +3123,7 @@ function resolveRetryConfig(target) {
2875
3123
  retryableStatusCodes
2876
3124
  };
2877
3125
  }
2878
- function resolveTargetDefinition(definition, env = process.env) {
3126
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
2879
3127
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
2880
3128
  const provider = parsed.provider.toLowerCase();
2881
3129
  const providerBatching = resolveOptionalBoolean(
@@ -2948,7 +3196,7 @@ function resolveTargetDefinition(definition, env = process.env) {
2948
3196
  judgeTarget: parsed.judge_target,
2949
3197
  workers: parsed.workers,
2950
3198
  providerBatching,
2951
- config: resolveCliConfig(parsed, env)
3199
+ config: resolveCliConfig(parsed, env, evalFilePath)
2952
3200
  };
2953
3201
  default:
2954
3202
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
@@ -2965,7 +3213,10 @@ function resolveAzureConfig(target, env) {
2965
3213
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
2966
3214
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
2967
3215
  const version = normalizeAzureApiVersion(
2968
- resolveOptionalString(versionSource, env, `${target.name} api version`)
3216
+ resolveOptionalString(versionSource, env, `${target.name} api version`, {
3217
+ allowLiteral: true,
3218
+ optionalEnv: true
3219
+ })
2969
3220
  );
2970
3221
  const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
2971
3222
  const maxOutputTokens = resolveOptionalNumber(
@@ -3066,7 +3317,8 @@ function normalizeCodexLogFormat(value) {
3066
3317
  }
3067
3318
  function resolveMockConfig(target) {
3068
3319
  const response = typeof target.response === "string" ? target.response : void 0;
3069
- return { response };
3320
+ const trace = Array.isArray(target.trace) ? target.trace : void 0;
3321
+ return { response, trace };
3070
3322
  }
3071
3323
  function resolveVSCodeConfig(target, env, insiders) {
3072
3324
  const workspaceTemplateEnvVar = resolveOptionalLiteralString(
@@ -3098,15 +3350,18 @@ function resolveVSCodeConfig(target, env, insiders) {
3098
3350
  workspaceTemplate
3099
3351
  };
3100
3352
  }
3101
- function resolveCliConfig(target, env) {
3353
+ function resolveCliConfig(target, env, evalFilePath) {
3102
3354
  const commandTemplateSource = target.command_template ?? target.commandTemplate;
3103
3355
  const filesFormat = resolveOptionalLiteralString(
3104
3356
  target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3105
3357
  );
3106
- const cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3358
+ let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3107
3359
  allowLiteral: true,
3108
3360
  optionalEnv: true
3109
3361
  });
3362
+ if (!cwd && evalFilePath) {
3363
+ cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
3364
+ }
3110
3365
  const timeoutMs = resolveTimeoutMs(
3111
3366
  target.timeout_seconds ?? target.timeoutSeconds,
3112
3367
  `${target.name} timeout`
@@ -3224,17 +3479,15 @@ function resolveOptionalString(source, env, description, options) {
3224
3479
  if (envVarMatch) {
3225
3480
  const varName = envVarMatch[1];
3226
3481
  const envValue = env[varName];
3227
- if (envValue !== void 0) {
3228
- if (envValue.trim().length === 0) {
3229
- throw new Error(`Environment variable '${varName}' for ${description} is empty`);
3230
- }
3231
- return envValue;
3232
- }
3233
3482
  const optionalEnv = options?.optionalEnv ?? false;
3234
- if (optionalEnv) {
3235
- return void 0;
3483
+ if (envValue === void 0 || envValue.trim().length === 0) {
3484
+ if (optionalEnv) {
3485
+ return void 0;
3486
+ }
3487
+ const status = envValue === void 0 ? "is not set" : "is empty";
3488
+ throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
3236
3489
  }
3237
- throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
3490
+ return envValue;
3238
3491
  }
3239
3492
  const allowLiteral = options?.allowLiteral ?? false;
3240
3493
  if (!allowLiteral) {
@@ -3346,7 +3599,7 @@ function resolveOptionalNumberArray(source, description) {
3346
3599
  }
3347
3600
 
3348
3601
  // src/evaluation/providers/vscode.ts
3349
- var import_node_path11 = __toESM(require("path"), 1);
3602
+ var import_node_path12 = __toESM(require("path"), 1);
3350
3603
  var import_subagent = require("subagent");
3351
3604
 
3352
3605
  // src/evaluation/providers/vscode-templates.ts
@@ -3516,7 +3769,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
3516
3769
  return "";
3517
3770
  }
3518
3771
  const buildList = (files) => files.map((absolutePath) => {
3519
- const fileName = import_node_path11.default.basename(absolutePath);
3772
+ const fileName = import_node_path12.default.basename(absolutePath);
3520
3773
  const fileUri = pathToFileUri2(absolutePath);
3521
3774
  return `* [${fileName}](${fileUri})`;
3522
3775
  });
@@ -3541,8 +3794,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3541
3794
  }
3542
3795
  const unique = /* @__PURE__ */ new Map();
3543
3796
  for (const attachment of attachments) {
3544
- const absolutePath = import_node_path11.default.resolve(attachment);
3545
- const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
3797
+ const absolutePath = import_node_path12.default.resolve(attachment);
3798
+ const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
3546
3799
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3547
3800
  if (!unique.has(absolutePath)) {
3548
3801
  unique.set(absolutePath, absolutePath);
@@ -3557,7 +3810,7 @@ function collectAttachmentFiles(attachments) {
3557
3810
  }
3558
3811
  const unique = /* @__PURE__ */ new Map();
3559
3812
  for (const attachment of attachments) {
3560
- const absolutePath = import_node_path11.default.resolve(attachment);
3813
+ const absolutePath = import_node_path12.default.resolve(attachment);
3561
3814
  if (!unique.has(absolutePath)) {
3562
3815
  unique.set(absolutePath, absolutePath);
3563
3816
  }
@@ -3565,7 +3818,7 @@ function collectAttachmentFiles(attachments) {
3565
3818
  return Array.from(unique.values());
3566
3819
  }
3567
3820
  function pathToFileUri2(filePath) {
3568
- const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
3821
+ const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
3569
3822
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3570
3823
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3571
3824
  return `file:///${normalizedPath}`;
@@ -3578,7 +3831,7 @@ function normalizeAttachments(attachments) {
3578
3831
  }
3579
3832
  const deduped = /* @__PURE__ */ new Set();
3580
3833
  for (const attachment of attachments) {
3581
- deduped.add(import_node_path11.default.resolve(attachment));
3834
+ deduped.add(import_node_path12.default.resolve(attachment));
3582
3835
  }
3583
3836
  return Array.from(deduped);
3584
3837
  }
@@ -3587,7 +3840,7 @@ function mergeAttachments(all) {
3587
3840
  for (const list of all) {
3588
3841
  if (!list) continue;
3589
3842
  for (const inputFile of list) {
3590
- deduped.add(import_node_path11.default.resolve(inputFile));
3843
+ deduped.add(import_node_path12.default.resolve(inputFile));
3591
3844
  }
3592
3845
  }
3593
3846
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3636,7 +3889,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3636
3889
  // src/evaluation/providers/targets-file.ts
3637
3890
  var import_node_fs4 = require("fs");
3638
3891
  var import_promises10 = require("fs/promises");
3639
- var import_node_path12 = __toESM(require("path"), 1);
3892
+ var import_node_path13 = __toESM(require("path"), 1);
3640
3893
  var import_yaml3 = require("yaml");
3641
3894
  function isRecord(value) {
3642
3895
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3673,7 +3926,7 @@ async function fileExists3(filePath) {
3673
3926
  }
3674
3927
  }
3675
3928
  async function readTargetDefinitions(filePath) {
3676
- const absolutePath = import_node_path12.default.resolve(filePath);
3929
+ const absolutePath = import_node_path13.default.resolve(filePath);
3677
3930
  if (!await fileExists3(absolutePath)) {
3678
3931
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3679
3932
  }
@@ -3999,9 +4252,11 @@ var CodeEvaluator = class {
3999
4252
  expected_outcome: context.evalCase.expected_outcome,
4000
4253
  reference_answer: context.evalCase.reference_answer,
4001
4254
  candidate_answer: context.candidate,
4002
- guideline_paths: context.evalCase.guideline_paths,
4003
- input_files: context.evalCase.file_paths,
4004
- input_segments: context.evalCase.input_segments
4255
+ guideline_files: context.evalCase.guideline_paths,
4256
+ input_files: context.evalCase.file_paths.filter(
4257
+ (path15) => !context.evalCase.guideline_paths.includes(path15)
4258
+ ),
4259
+ input_messages: context.evalCase.input_messages
4005
4260
  },
4006
4261
  null,
4007
4262
  2
@@ -4121,6 +4376,251 @@ function substituteVariables(template, variables) {
4121
4376
  return variables[varName] ?? match;
4122
4377
  });
4123
4378
  }
4379
/**
 * Scores a candidate's tool-call trace against the evaluator configuration.
 *
 * Modes (from `config.mode`):
 *   - "any_order": every tool in `config.minimums` must be called at least the
 *     given number of times; score is the fraction of tools that satisfy it.
 *   - "in_order": the tools in `config.expected` must appear in that order in
 *     the trace, with other calls allowed in between; score is the fraction of
 *     expected tools found.
 *   - "exact": the trace's tool calls must match `config.expected` position by
 *     position, with no missing and no extra calls.
 *
 * Only trace events with `type === "tool_call"` and a truthy `name` are
 * treated as tool calls. Verdicts come from scoreToVerdict.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{config: {mode: string, minimums?: Object<string, number>, expected?: Array<{tool: string}>}}} options
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Evaluate the trace carried by the context.
   *
   * @param {{candidateTrace?: Array<object>, candidateTraceSummary?: object}} context
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   *   A failing result when either the trace or its summary is missing, or
   *   when the configured mode is unknown.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Check per-tool minimum call counts using the trace summary.
   *
   * @param {{toolCallsByName: Object<string, number>}} summary
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const counts = summary.toolCallsByName;
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property lookup: a required tool named e.g. "constructor" must not
      // pick up the inherited Object.prototype member when it was never called.
      const recorded = Object.prototype.hasOwnProperty.call(counts, toolName) ? counts[toolName] : void 0;
      const actual = recorded ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the expected tools occur in order (as a subsequence of the
   * trace's tool calls).
   *
   * @param {Array<object>} trace
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    // The cursor only moves forward, so each actual call can satisfy at most
    // one expected entry.
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the trace's tool calls equal the expected sequence exactly.
   *
   * @param {Array<object>} trace
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Divide by the longer sequence so surplus tool calls lower the score.
    // Dividing by expected.length alone let a trace with extra trailing calls
    // score 1.0 in "exact" mode despite the length mismatch recorded above.
    const score = hits.length / Math.max(expected.length, actualToolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
4525
/**
 * Validates the tool calls in a candidate trace against the `tool_calls`
 * declared on assistant messages in the eval case's expected segments.
 *
 * Expected calls are compared with the trace's "tool_call" events position by
 * position: the tool name must match, and when the expected call declares an
 * `input`, it must deep-equal the actual call's `input`. Trace calls beyond
 * the expected ones are not inspected.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * @param {{candidateTrace?: Array<object>, evalCase: {expected_segments?: Array<object>}}} context
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   *   Passes when no tool calls are expected; fails when tool calls are
   *   expected but the trace is missing or empty.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collect `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments, in order. Entries without a string `tool` are ignored.
   *
   * @param {Array<object>|undefined} segments
   * @returns {Array<{tool: string, input: unknown}>}
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      if (segment.role !== "assistant" || !Array.isArray(segment.tool_calls)) {
        continue;
      }
      for (const tc of segment.tool_calls) {
        if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
          toolCalls.push({ tool: tc.tool, input: tc.input });
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compare expected and actual tool calls index by index.
   *
   * @param {Array<{tool: string, input: unknown}>} expected
   * @param {Array<{name?: string, input?: unknown}>} actual
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Input is only checked when the expectation declares one.
      if (expectedCall.input !== void 0 && !this.deepEquals(expectedCall.input, actualCall.input)) {
        misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
        continue;
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero when called with no expectations.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for JSON-like values (primitives, arrays, plain
   * objects). Arrays are compared element by element; objects must have the
   * same own enumerable keys with deep-equal values.
   *
   * @param {unknown} a
   * @param {unknown} b
   * @returns {boolean}
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    const aIsArray = Array.isArray(a);
    if (aIsArray !== Array.isArray(b)) return false;
    if (aIsArray) {
      return a.length === b.length && a.every((val, i) => this.deepEquals(val, b[i]));
    }
    const aKeys = Object.keys(a);
    if (aKeys.length !== Object.keys(b).length) return false;
    // Equal key counts are not enough: each key of `a` must be an own key of
    // `b`, otherwise a missing key compares equal to an undefined value
    // ({ a: undefined } vs { b: undefined }).
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(b, key) && this.deepEquals(a[key], b[key])
    );
  }
};
4124
4624
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
4125
4625
  {{EVALUATOR_RESULTS_JSON}}
4126
4626
 
@@ -4347,7 +4847,7 @@ var CompositeEvaluator = class {
4347
4847
  // src/evaluation/orchestrator.ts
4348
4848
  var import_node_crypto2 = require("crypto");
4349
4849
  var import_promises11 = require("fs/promises");
4350
- var import_node_path13 = __toESM(require("path"), 1);
4850
+ var import_node_path14 = __toESM(require("path"), 1);
4351
4851
 
4352
4852
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
4353
4853
  var Node = class {
@@ -4554,7 +5054,7 @@ async function runEvaluation(options) {
4554
5054
  if (!definition) {
4555
5055
  return void 0;
4556
5056
  }
4557
- const resolved = resolveTargetDefinition(definition, envLookup);
5057
+ const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
4558
5058
  resolvedTargetsByName.set(name, resolved);
4559
5059
  return resolved;
4560
5060
  };
@@ -4868,6 +5368,17 @@ async function runEvalCase(options) {
4868
5368
  if (cacheKey && cache && !cachedResponse) {
4869
5369
  await cache.set(cacheKey, providerResponse);
4870
5370
  }
5371
+ let candidateTrace = providerResponse.trace;
5372
+ if (!candidateTrace && providerResponse.traceRef) {
5373
+ try {
5374
+ const rawTrace = await readJsonFile(providerResponse.traceRef);
5375
+ if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
5376
+ candidateTrace = rawTrace;
5377
+ }
5378
+ } catch {
5379
+ }
5380
+ }
5381
+ const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
4871
5382
  try {
4872
5383
  return await evaluateCandidate({
4873
5384
  evalCase,
@@ -4879,7 +5390,9 @@ async function runEvalCase(options) {
4879
5390
  nowFn,
4880
5391
  attempt,
4881
5392
  judgeProvider,
4882
- agentTimeoutMs
5393
+ agentTimeoutMs,
5394
+ candidateTrace,
5395
+ candidateTraceSummary
4883
5396
  });
4884
5397
  } catch (error) {
4885
5398
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4896,7 +5409,9 @@ async function evaluateCandidate(options) {
4896
5409
  nowFn,
4897
5410
  attempt,
4898
5411
  judgeProvider,
4899
- agentTimeoutMs
5412
+ agentTimeoutMs,
5413
+ candidateTrace,
5414
+ candidateTraceSummary
4900
5415
  } = options;
4901
5416
  const gradeTimestamp = nowFn();
4902
5417
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4909,7 +5424,9 @@ async function evaluateCandidate(options) {
4909
5424
  promptInputs,
4910
5425
  now: gradeTimestamp,
4911
5426
  judgeProvider,
4912
- agentTimeoutMs
5427
+ agentTimeoutMs,
5428
+ candidateTrace,
5429
+ candidateTraceSummary
4913
5430
  });
4914
5431
  const completedAt = nowFn();
4915
5432
  let agentProviderRequest;
@@ -4922,14 +5439,12 @@ async function evaluateCandidate(options) {
4922
5439
  } else {
4923
5440
  if (promptInputs.chatPrompt) {
4924
5441
  lmProviderRequest = {
4925
- chat_prompt: promptInputs.chatPrompt,
4926
- guideline_paths: evalCase.guideline_paths
5442
+ chat_prompt: promptInputs.chatPrompt
4927
5443
  };
4928
5444
  } else {
4929
5445
  lmProviderRequest = {
4930
5446
  question: promptInputs.question,
4931
- guidelines: promptInputs.guidelines,
4932
- guideline_paths: evalCase.guideline_paths
5447
+ guidelines: promptInputs.guidelines
4933
5448
  };
4934
5449
  }
4935
5450
  }
@@ -4948,7 +5463,8 @@ async function evaluateCandidate(options) {
4948
5463
  agent_provider_request: agentProviderRequest,
4949
5464
  lm_provider_request: lmProviderRequest,
4950
5465
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4951
- evaluator_results: evaluatorResults
5466
+ evaluator_results: evaluatorResults,
5467
+ trace_summary: candidateTraceSummary
4952
5468
  };
4953
5469
  }
4954
5470
  async function runEvaluatorsForCase(options) {
@@ -4962,7 +5478,9 @@ async function runEvaluatorsForCase(options) {
4962
5478
  promptInputs,
4963
5479
  now,
4964
5480
  judgeProvider,
4965
- agentTimeoutMs
5481
+ agentTimeoutMs,
5482
+ candidateTrace,
5483
+ candidateTraceSummary
4966
5484
  } = options;
4967
5485
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4968
5486
  return runEvaluatorList({
@@ -4976,7 +5494,9 @@ async function runEvaluatorsForCase(options) {
4976
5494
  promptInputs,
4977
5495
  now,
4978
5496
  judgeProvider,
4979
- agentTimeoutMs
5497
+ agentTimeoutMs,
5498
+ candidateTrace,
5499
+ candidateTraceSummary
4980
5500
  });
4981
5501
  }
4982
5502
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4992,7 +5512,9 @@ async function runEvaluatorsForCase(options) {
4992
5512
  attempt,
4993
5513
  promptInputs,
4994
5514
  now,
4995
- judgeProvider
5515
+ judgeProvider,
5516
+ candidateTrace,
5517
+ candidateTraceSummary
4996
5518
  });
4997
5519
  return { score };
4998
5520
  }
@@ -5008,7 +5530,9 @@ async function runEvaluatorList(options) {
5008
5530
  promptInputs,
5009
5531
  now,
5010
5532
  judgeProvider,
5011
- agentTimeoutMs
5533
+ agentTimeoutMs,
5534
+ candidateTrace,
5535
+ candidateTraceSummary
5012
5536
  } = options;
5013
5537
  const scored = [];
5014
5538
  const evaluatorResults = [];
@@ -5027,11 +5551,13 @@ async function runEvaluatorList(options) {
5027
5551
  now,
5028
5552
  judgeProvider
5029
5553
  });
5030
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
5554
+ const weight = evaluator.weight ?? 1;
5555
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
5031
5556
  evaluatorResults.push({
5032
5557
  name: evaluator.name,
5033
5558
  type: evaluator.type,
5034
5559
  score: score2.score,
5560
+ weight,
5035
5561
  verdict: score2.verdict,
5036
5562
  hits: score2.hits,
5037
5563
  misses: score2.misses,
@@ -5054,11 +5580,13 @@ async function runEvaluatorList(options) {
5054
5580
  promptInputs,
5055
5581
  now
5056
5582
  });
5057
- scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
5583
+ const weight = evaluator.weight ?? 1;
5584
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
5058
5585
  evaluatorResults.push({
5059
5586
  name: evaluator.name,
5060
5587
  type: "code_judge",
5061
5588
  score: score2.score,
5589
+ weight,
5062
5590
  verdict: score2.verdict,
5063
5591
  hits: score2.hits,
5064
5592
  misses: score2.misses,
@@ -5067,7 +5595,7 @@ async function runEvaluatorList(options) {
5067
5595
  });
5068
5596
  }
5069
5597
  if (evaluator.type === "composite") {
5070
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path13.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5598
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5071
5599
  const createEvaluator = (memberConfig) => {
5072
5600
  switch (memberConfig.type) {
5073
5601
  case "llm_judge":
@@ -5084,6 +5612,12 @@ async function runEvaluatorList(options) {
5084
5612
  cwd: evalFileDir,
5085
5613
  evaluatorFactory: { create: createEvaluator }
5086
5614
  });
5615
+ case "tool_trajectory":
5616
+ return new ToolTrajectoryEvaluator({
5617
+ config: memberConfig
5618
+ });
5619
+ case "expected_messages":
5620
+ return new ExpectedMessagesEvaluator();
5087
5621
  default: {
5088
5622
  const unknownConfig = memberConfig;
5089
5623
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5105,11 +5639,13 @@ async function runEvaluatorList(options) {
5105
5639
  now,
5106
5640
  judgeProvider
5107
5641
  });
5108
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
5642
+ const weight = evaluator.weight ?? 1;
5643
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
5109
5644
  evaluatorResults.push({
5110
5645
  name: evaluator.name,
5111
5646
  type: evaluator.type,
5112
5647
  score: score2.score,
5648
+ weight,
5113
5649
  verdict: score2.verdict,
5114
5650
  hits: score2.hits,
5115
5651
  misses: score2.misses,
@@ -5118,6 +5654,60 @@ async function runEvaluatorList(options) {
5118
5654
  evaluator_results: mapChildResults(score2.evaluatorResults)
5119
5655
  });
5120
5656
  }
5657
+ if (evaluator.type === "tool_trajectory") {
5658
+ const trajectoryEvaluator = new ToolTrajectoryEvaluator({
5659
+ config: evaluator
5660
+ });
5661
+ const score2 = trajectoryEvaluator.evaluate({
5662
+ evalCase,
5663
+ candidate,
5664
+ target,
5665
+ provider,
5666
+ attempt,
5667
+ promptInputs,
5668
+ now,
5669
+ candidateTrace,
5670
+ candidateTraceSummary
5671
+ });
5672
+ const weight = evaluator.weight ?? 1;
5673
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
5674
+ evaluatorResults.push({
5675
+ name: evaluator.name,
5676
+ type: evaluator.type,
5677
+ score: score2.score,
5678
+ weight,
5679
+ verdict: score2.verdict,
5680
+ hits: score2.hits,
5681
+ misses: score2.misses,
5682
+ reasoning: score2.reasoning
5683
+ });
5684
+ }
5685
+ if (evaluator.type === "expected_messages") {
5686
+ const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
5687
+ const score2 = expectedMessagesEvaluator.evaluate({
5688
+ evalCase,
5689
+ candidate,
5690
+ target,
5691
+ provider,
5692
+ attempt,
5693
+ promptInputs,
5694
+ now,
5695
+ candidateTrace,
5696
+ candidateTraceSummary
5697
+ });
5698
+ const weight = evaluator.weight ?? 1;
5699
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
5700
+ evaluatorResults.push({
5701
+ name: evaluator.name,
5702
+ type: evaluator.type,
5703
+ score: score2.score,
5704
+ weight,
5705
+ verdict: score2.verdict,
5706
+ hits: score2.hits,
5707
+ misses: score2.misses,
5708
+ reasoning: score2.reasoning
5709
+ });
5710
+ }
5121
5711
  } catch (error) {
5122
5712
  const message = error instanceof Error ? error.message : String(error);
5123
5713
  const fallbackScore = {
@@ -5129,15 +5719,18 @@ async function runEvaluatorList(options) {
5129
5719
  reasoning: message
5130
5720
  };
5131
5721
  const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
5722
+ const weight = evaluator.weight ?? 1;
5132
5723
  scored.push({
5133
5724
  score: fallbackScore,
5134
5725
  name: evaluator.name ?? "unknown",
5135
- type: resultType ?? "llm_judge"
5726
+ type: resultType ?? "llm_judge",
5727
+ weight
5136
5728
  });
5137
5729
  evaluatorResults.push({
5138
5730
  name: evaluator.name ?? "unknown",
5139
5731
  type: resultType ?? "llm_judge",
5140
5732
  score: 0,
5733
+ weight,
5141
5734
  verdict: "fail",
5142
5735
  hits: [],
5143
5736
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -5145,7 +5738,9 @@ async function runEvaluatorList(options) {
5145
5738
  });
5146
5739
  }
5147
5740
  }
5148
- const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
5741
+ const aggregateScore = scored.length > 0 ? computeWeightedMean(
5742
+ scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
5743
+ ) : 0;
5149
5744
  const hits = scored.flatMap((entry) => entry.score.hits);
5150
5745
  const misses = scored.flatMap((entry) => entry.score.misses);
5151
5746
  const expectedAspectCount = scored.reduce(
@@ -5240,8 +5835,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
5240
5835
  async function dumpPrompt(directory, evalCase, promptInputs) {
5241
5836
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
5242
5837
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
5243
- const filePath = import_node_path13.default.resolve(directory, filename);
5244
- await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
5838
+ const filePath = import_node_path14.default.resolve(directory, filename);
5839
+ await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
5245
5840
  const payload = {
5246
5841
  eval_id: evalCase.id,
5247
5842
  question: promptInputs.question,
@@ -5371,6 +5966,16 @@ function mapChildResults(children) {
5371
5966
  evaluator_results: mapChildResults(child.evaluatorResults)
5372
5967
  }));
5373
5968
  }
5969
/**
 * Computes the weighted arithmetic mean of a list of scores.
 *
 * A missing (`null`/`undefined`) weight defaults to 1. Entries whose weight is
 * not a finite positive number (zero, negative, NaN, Infinity, non-numeric)
 * contribute nothing, so an invalid weight cannot push the mean outside the
 * range of the scores or turn it into NaN.
 *
 * @param entries Array of `{ score, weight? }` objects.
 * @returns The weighted mean, or 0 when no entry carries a usable weight.
 */
function computeWeightedMean(entries) {
  let totalWeight = 0;
  let weightedSum = 0;
  for (const entry of entries) {
    const weight = entry.weight ?? 1;
    // Skip unusable weights instead of letting them distort the aggregate.
    if (!Number.isFinite(weight) || weight <= 0) continue;
    totalWeight += weight;
    weightedSum += entry.score * weight;
  }
  return totalWeight > 0 ? weightedSum / totalWeight : 0;
}
5374
5979
 
5375
5980
  // src/evaluation/generators/rubric-generator.ts
5376
5981
  var import_ai3 = require("ai");
@@ -5460,11 +6065,14 @@ function createAgentKernel() {
5460
6065
  0 && (module.exports = {
5461
6066
  CodeEvaluator,
5462
6067
  CompositeEvaluator,
6068
+ ExpectedMessagesEvaluator,
5463
6069
  LlmJudgeEvaluator,
5464
6070
  TEST_MESSAGE_ROLES,
6071
+ ToolTrajectoryEvaluator,
5465
6072
  buildDirectoryChain,
5466
6073
  buildPromptInputs,
5467
6074
  buildSearchRoots,
6075
+ computeTraceSummary,
5468
6076
  consumeCodexLogEntries,
5469
6077
  createAgentKernel,
5470
6078
  createProvider,
@@ -5475,14 +6083,18 @@ function createAgentKernel() {
5475
6083
  generateRubrics,
5476
6084
  getHitCount,
5477
6085
  isEvaluatorKind,
6086
+ isExpectedToolCall,
5478
6087
  isGuidelineFile,
5479
6088
  isJsonObject,
5480
6089
  isJsonValue,
5481
6090
  isTestMessage,
5482
6091
  isTestMessageRole,
6092
+ isTraceEvent,
6093
+ isTraceEventType,
5483
6094
  listTargetNames,
5484
6095
  loadEvalCases,
5485
6096
  normalizeLineEndings,
6097
+ readJsonFile,
5486
6098
  readTargetDefinitions,
5487
6099
  readTestSuiteMetadata,
5488
6100
  readTextFile,