@agentv/core 0.14.2 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -116,7 +116,7 @@ function getHitCount(result) {
116
116
  }
117
117
 
118
118
  // src/evaluation/yaml-parser.ts
119
- var import_promises5 = require("fs/promises");
119
+ var import_promises6 = require("fs/promises");
120
120
  var import_node_path6 = __toESM(require("path"), 1);
121
121
  var import_yaml2 = require("yaml");
122
122
 
@@ -154,7 +154,7 @@ ${part.content}
154
154
  }
155
155
  return parts.map((p) => p.content).join(" ");
156
156
  }
157
- function formatSegment(segment) {
157
+ function formatSegment(segment, mode = "lm") {
158
158
  const type = asString(segment.type);
159
159
  if (type === "text") {
160
160
  return asString(segment.value);
@@ -164,8 +164,14 @@ function formatSegment(segment) {
164
164
  return refPath ? `<Attached: ${refPath}>` : void 0;
165
165
  }
166
166
  if (type === "file") {
167
- const text = asString(segment.text);
168
167
  const filePath = asString(segment.path);
168
+ if (!filePath) {
169
+ return void 0;
170
+ }
171
+ if (mode === "agent") {
172
+ return `<file: path="${filePath}">`;
173
+ }
174
+ const text = asString(segment.text);
169
175
  if (text && filePath) {
170
176
  return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
171
177
  }
@@ -369,8 +375,67 @@ function logWarning(message) {
369
375
 
370
376
  // src/evaluation/loaders/evaluator-parser.ts
371
377
  var import_node_path3 = __toESM(require("path"), 1);
378
+
379
+ // src/evaluation/validation/prompt-validator.ts
380
+ var import_promises3 = require("fs/promises");
381
+
382
+ // src/evaluation/template-variables.ts
383
+ var TEMPLATE_VARIABLES = {
384
+ CANDIDATE_ANSWER: "candidate_answer",
385
+ EXPECTED_MESSAGES: "expected_messages",
386
+ QUESTION: "question",
387
+ EXPECTED_OUTCOME: "expected_outcome",
388
+ REFERENCE_ANSWER: "reference_answer",
389
+ INPUT_MESSAGES: "input_messages"
390
+ };
391
+ var VALID_TEMPLATE_VARIABLES = new Set(
392
+ Object.values(TEMPLATE_VARIABLES)
393
+ );
394
+ var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
395
+ TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
396
+ TEMPLATE_VARIABLES.EXPECTED_MESSAGES
397
+ ]);
398
+
399
+ // src/evaluation/validation/prompt-validator.ts
372
400
  var ANSI_YELLOW2 = "\x1B[33m";
373
401
  var ANSI_RESET2 = "\x1B[0m";
402
+ async function validateCustomPromptContent(promptPath) {
403
+ const content = await (0, import_promises3.readFile)(promptPath, "utf8");
404
+ validateTemplateVariables(content, promptPath);
405
+ }
406
+ function validateTemplateVariables(content, source) {
407
+ const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
408
+ const foundVariables = /* @__PURE__ */ new Set();
409
+ const invalidVariables = [];
410
+ let match;
411
+ while ((match = variablePattern.exec(content)) !== null) {
412
+ const varName = match[1];
413
+ foundVariables.add(varName);
414
+ if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
415
+ invalidVariables.push(varName);
416
+ }
417
+ }
418
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
419
+ const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
420
+ const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
421
+ if (!hasRequiredFields) {
422
+ throw new Error(
423
+ `Missing required fields. Must include at least one of:
424
+ - {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
425
+ - {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
426
+ );
427
+ }
428
+ if (invalidVariables.length > 0) {
429
+ const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
430
+ Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
431
+ Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
432
+ console.warn(warningMessage);
433
+ }
434
+ }
435
+
436
+ // src/evaluation/loaders/evaluator-parser.ts
437
+ var ANSI_YELLOW3 = "\x1B[33m";
438
+ var ANSI_RESET3 = "\x1B[0m";
374
439
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
375
440
  const execution = rawEvalCase.execution;
376
441
  const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
@@ -429,6 +494,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
429
494
  const resolved = await resolveFileReference(prompt, searchRoots);
430
495
  if (resolved.resolvedPath) {
431
496
  promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
497
+ try {
498
+ await validateCustomPromptContent(promptPath);
499
+ } catch (error) {
500
+ const message = error instanceof Error ? error.message : String(error);
501
+ throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
502
+ }
432
503
  } else {
433
504
  logWarning2(
434
505
  `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
@@ -465,18 +536,18 @@ function isJsonObject2(value) {
465
536
  function logWarning2(message, details) {
466
537
  if (details && details.length > 0) {
467
538
  const detailBlock = details.join("\n");
468
- console.warn(`${ANSI_YELLOW2}Warning: ${message}
469
- ${detailBlock}${ANSI_RESET2}`);
539
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
540
+ ${detailBlock}${ANSI_RESET3}`);
470
541
  } else {
471
- console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
542
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
472
543
  }
473
544
  }
474
545
 
475
546
  // src/evaluation/loaders/message-processor.ts
476
- var import_promises3 = require("fs/promises");
547
+ var import_promises4 = require("fs/promises");
477
548
  var import_node_path4 = __toESM(require("path"), 1);
478
- var ANSI_YELLOW3 = "\x1B[33m";
479
- var ANSI_RESET3 = "\x1B[0m";
549
+ var ANSI_YELLOW4 = "\x1B[33m";
550
+ var ANSI_RESET4 = "\x1B[0m";
480
551
  async function processMessages(options) {
481
552
  const {
482
553
  messages,
@@ -519,7 +590,7 @@ async function processMessages(options) {
519
590
  continue;
520
591
  }
521
592
  try {
522
- const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
593
+ const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
523
594
  if (messageType === "input" && guidelinePatterns && guidelinePaths) {
524
595
  const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
525
596
  if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
@@ -590,7 +661,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
590
661
  continue;
591
662
  }
592
663
  try {
593
- const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
664
+ const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
594
665
  parts.push({ content: fileContent, isFile: true, displayPath });
595
666
  if (verbose) {
596
667
  console.log(` [Expected Assistant File] Found: ${displayPath}`);
@@ -640,19 +711,19 @@ function cloneJsonValue(value) {
640
711
  function logWarning3(message, details) {
641
712
  if (details && details.length > 0) {
642
713
  const detailBlock = details.join("\n");
643
- console.warn(`${ANSI_YELLOW3}Warning: ${message}
644
- ${detailBlock}${ANSI_RESET3}`);
714
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}
715
+ ${detailBlock}${ANSI_RESET4}`);
645
716
  } else {
646
- console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
717
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
647
718
  }
648
719
  }
649
720
 
650
721
  // src/evaluation/formatting/prompt-builder.ts
651
- var import_promises4 = require("fs/promises");
722
+ var import_promises5 = require("fs/promises");
652
723
  var import_node_path5 = __toESM(require("path"), 1);
653
- var ANSI_YELLOW4 = "\x1B[33m";
654
- var ANSI_RESET4 = "\x1B[0m";
655
- async function buildPromptInputs(testCase) {
724
+ var ANSI_YELLOW5 = "\x1B[33m";
725
+ var ANSI_RESET5 = "\x1B[0m";
726
+ async function buildPromptInputs(testCase, mode = "lm") {
656
727
  const guidelineParts = [];
657
728
  for (const rawPath of testCase.guideline_paths) {
658
729
  const absolutePath = import_node_path5.default.resolve(rawPath);
@@ -661,7 +732,7 @@ async function buildPromptInputs(testCase) {
661
732
  continue;
662
733
  }
663
734
  try {
664
- const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
735
+ const content = (await (0, import_promises5.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
665
736
  guidelineParts.push({
666
737
  content,
667
738
  isFile: true,
@@ -728,7 +799,7 @@ async function buildPromptInputs(testCase) {
728
799
  const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
729
800
  const contentParts = [];
730
801
  for (const segment of segments) {
731
- const formattedContent = formatSegment(segment);
802
+ const formattedContent = formatSegment(segment, mode);
732
803
  if (formattedContent) {
733
804
  contentParts.push(formattedContent);
734
805
  }
@@ -743,7 +814,11 @@ ${messageContent}`);
743
814
  } else {
744
815
  const questionParts = [];
745
816
  for (const segment of testCase.input_segments) {
746
- const formattedContent = formatSegment(segment);
817
+ if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
818
+ questionParts.push(`<Attached: ${segment.path}>`);
819
+ continue;
820
+ }
821
+ const formattedContent = formatSegment(segment, mode);
747
822
  if (formattedContent) {
748
823
  questionParts.push(formattedContent);
749
824
  }
@@ -757,7 +832,8 @@ ${messageContent}`);
757
832
  messages: testCase.input_messages,
758
833
  segmentsByMessage,
759
834
  guidelinePatterns: testCase.guideline_patterns,
760
- guidelineContent: guidelines
835
+ guidelineContent: guidelines,
836
+ mode
761
837
  }) : void 0;
762
838
  return { question, guidelines, chatPrompt };
763
839
  }
@@ -774,7 +850,7 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
774
850
  return messagesWithContent > 1;
775
851
  }
776
852
  function buildChatPromptFromSegments(options) {
777
- const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
853
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt, mode = "lm" } = options;
778
854
  if (messages.length === 0) {
779
855
  return void 0;
780
856
  }
@@ -792,7 +868,7 @@ ${guidelineContent.trim()}`);
792
868
  const segments = segmentsByMessage[startIndex];
793
869
  const contentParts = [];
794
870
  for (const segment of segments) {
795
- const formatted = formatSegment(segment);
871
+ const formatted = formatSegment(segment, mode);
796
872
  if (formatted) {
797
873
  contentParts.push(formatted);
798
874
  }
@@ -825,7 +901,7 @@ ${guidelineContent.trim()}`);
825
901
  if (segment.type === "guideline_ref") {
826
902
  continue;
827
903
  }
828
- const formatted = formatSegment(segment);
904
+ const formatted = formatSegment(segment, mode);
829
905
  if (formatted) {
830
906
  const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
831
907
  if (isGuidelineRef) {
@@ -849,17 +925,18 @@ function asString4(value) {
849
925
  return typeof value === "string" ? value : void 0;
850
926
  }
851
927
  function logWarning4(message) {
852
- console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
928
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
853
929
  }
854
930
 
855
931
  // src/evaluation/yaml-parser.ts
856
- var ANSI_YELLOW5 = "\x1B[33m";
857
- var ANSI_RESET5 = "\x1B[0m";
932
+ var ANSI_YELLOW6 = "\x1B[33m";
933
+ var ANSI_RED = "\x1B[31m";
934
+ var ANSI_RESET6 = "\x1B[0m";
858
935
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
859
936
  async function readTestSuiteMetadata(testFilePath) {
860
937
  try {
861
938
  const absolutePath = import_node_path6.default.resolve(testFilePath);
862
- const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
939
+ const content = await (0, import_promises6.readFile)(absolutePath, "utf8");
863
940
  const parsed = (0, import_yaml2.parse)(content);
864
941
  if (!isJsonObject(parsed)) {
865
942
  return {};
@@ -877,7 +954,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
877
954
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
878
955
  const config = await loadConfig(absoluteTestPath, repoRootPath);
879
956
  const guidelinePatterns = config?.guideline_patterns;
880
- const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
957
+ const rawFile = await (0, import_promises6.readFile)(absoluteTestPath, "utf8");
881
958
  const parsed = (0, import_yaml2.parse)(rawFile);
882
959
  if (!isJsonObject(parsed)) {
883
960
  throw new Error(`Invalid test file format: ${evalFilePath}`);
@@ -915,14 +992,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
915
992
  const inputMessagesValue = evalcase.input_messages;
916
993
  const expectedMessagesValue = evalcase.expected_messages;
917
994
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
918
- logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
995
+ logError(`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`);
919
996
  continue;
920
997
  }
921
998
  const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
922
999
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
923
1000
  const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
924
1001
  if (hasExpectedMessages && expectedMessages.length === 0) {
925
- logWarning5(`No valid expected message found for eval case: ${id}`);
1002
+ logError(`No valid expected message found for eval case: ${id}`);
926
1003
  continue;
927
1004
  }
928
1005
  if (expectedMessages.length > 1) {
@@ -953,7 +1030,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
953
1030
  const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
954
1031
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
955
1032
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
956
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
1033
+ let evaluators;
1034
+ try {
1035
+ evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
1036
+ } catch (error) {
1037
+ const message = error instanceof Error ? error.message : String(error);
1038
+ logError(`Skipping eval case '${id}': ${message}`);
1039
+ continue;
1040
+ }
957
1041
  const userFilePaths = [];
958
1042
  for (const segment of inputSegments) {
959
1043
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -971,7 +1055,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
971
1055
  question,
972
1056
  input_messages: inputMessages,
973
1057
  input_segments: inputSegments,
974
- output_segments: outputSegments,
1058
+ expected_segments: outputSegments,
975
1059
  reference_answer: referenceAnswer,
976
1060
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
977
1061
  guideline_patterns: guidelinePatterns,
@@ -1003,20 +1087,29 @@ function asString5(value) {
1003
1087
  function logWarning5(message, details) {
1004
1088
  if (details && details.length > 0) {
1005
1089
  const detailBlock = details.join("\n");
1006
- console.warn(`${ANSI_YELLOW5}Warning: ${message}
1007
- ${detailBlock}${ANSI_RESET5}`);
1090
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}
1091
+ ${detailBlock}${ANSI_RESET6}`);
1008
1092
  } else {
1009
- console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
1093
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
1094
+ }
1095
+ }
1096
+ function logError(message, details) {
1097
+ if (details && details.length > 0) {
1098
+ const detailBlock = details.join("\n");
1099
+ console.error(`${ANSI_RED}Error: ${message}
1100
+ ${detailBlock}${ANSI_RESET6}`);
1101
+ } else {
1102
+ console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
1010
1103
  }
1011
1104
  }
1012
1105
 
1013
1106
  // src/evaluation/file-utils.ts
1014
1107
  var import_node_fs2 = require("fs");
1015
- var import_promises6 = require("fs/promises");
1108
+ var import_promises7 = require("fs/promises");
1016
1109
  var import_node_path7 = __toESM(require("path"), 1);
1017
1110
  async function fileExists2(filePath) {
1018
1111
  try {
1019
- await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
1112
+ await (0, import_promises7.access)(filePath, import_node_fs2.constants.F_OK);
1020
1113
  return true;
1021
1114
  } catch {
1022
1115
  return false;
@@ -1026,7 +1119,7 @@ function normalizeLineEndings(content) {
1026
1119
  return content.replace(/\r\n/g, "\n");
1027
1120
  }
1028
1121
  async function readTextFile(filePath) {
1029
- const content = await (0, import_promises6.readFile)(filePath, "utf8");
1122
+ const content = await (0, import_promises7.readFile)(filePath, "utf8");
1030
1123
  return normalizeLineEndings(content);
1031
1124
  }
1032
1125
  async function findGitRoot(startPath) {
@@ -1447,7 +1540,7 @@ async function withRetry(fn, retryConfig, signal) {
1447
1540
 
1448
1541
  // src/evaluation/providers/cli.ts
1449
1542
  var import_node_child_process = require("child_process");
1450
- var import_promises7 = __toESM(require("fs/promises"), 1);
1543
+ var import_promises8 = __toESM(require("fs/promises"), 1);
1451
1544
  var import_node_os = __toESM(require("os"), 1);
1452
1545
  var import_node_path8 = __toESM(require("path"), 1);
1453
1546
  var import_node_util = require("util");
@@ -1548,7 +1641,7 @@ var CliProvider = class {
1548
1641
  const errorMsg = error instanceof Error ? error.message : String(error);
1549
1642
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1550
1643
  } finally {
1551
- await import_promises7.default.unlink(filePath).catch(() => {
1644
+ await import_promises8.default.unlink(filePath).catch(() => {
1552
1645
  });
1553
1646
  }
1554
1647
  }
@@ -1687,7 +1780,7 @@ function formatTimeoutSuffix(timeoutMs) {
1687
1780
  var import_node_child_process2 = require("child_process");
1688
1781
  var import_node_crypto = require("crypto");
1689
1782
  var import_node_fs3 = require("fs");
1690
- var import_promises8 = require("fs/promises");
1783
+ var import_promises9 = require("fs/promises");
1691
1784
  var import_node_os2 = require("os");
1692
1785
  var import_node_path10 = __toESM(require("path"), 1);
1693
1786
  var import_node_util2 = require("util");
@@ -1877,7 +1970,7 @@ var CodexProvider = class {
1877
1970
  try {
1878
1971
  const promptContent = buildPromptDocument(request, inputFiles);
1879
1972
  const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
1880
- await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
1973
+ await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
1881
1974
  const args = this.buildCodexArgs();
1882
1975
  const cwd = this.resolveCwd(workspaceRoot);
1883
1976
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -1960,11 +2053,11 @@ var CodexProvider = class {
1960
2053
  }
1961
2054
  }
1962
2055
  async createWorkspace() {
1963
- return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
2056
+ return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1964
2057
  }
1965
2058
  async cleanupWorkspace(workspaceRoot) {
1966
2059
  try {
1967
- await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
2060
+ await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
1968
2061
  } catch {
1969
2062
  }
1970
2063
  }
@@ -1984,7 +2077,7 @@ var CodexProvider = class {
1984
2077
  return void 0;
1985
2078
  }
1986
2079
  try {
1987
- await (0, import_promises8.mkdir)(logDir, { recursive: true });
2080
+ await (0, import_promises9.mkdir)(logDir, { recursive: true });
1988
2081
  } catch (error) {
1989
2082
  const message = error instanceof Error ? error.message : String(error);
1990
2083
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
@@ -2207,7 +2300,7 @@ async function locateExecutable(candidate) {
2207
2300
  if (includesPathSeparator) {
2208
2301
  const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
2209
2302
  const executablePath = await ensureWindowsExecutableVariant(resolved);
2210
- await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
2303
+ await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
2211
2304
  return executablePath;
2212
2305
  }
2213
2306
  const locator = process.platform === "win32" ? "where" : "which";
@@ -2217,7 +2310,7 @@ async function locateExecutable(candidate) {
2217
2310
  const preferred = selectExecutableCandidate(lines);
2218
2311
  if (preferred) {
2219
2312
  const executablePath = await ensureWindowsExecutableVariant(preferred);
2220
- await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
2313
+ await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
2221
2314
  return executablePath;
2222
2315
  }
2223
2316
  } catch {
@@ -2251,7 +2344,7 @@ async function ensureWindowsExecutableVariant(candidate) {
2251
2344
  for (const ext of extensions) {
2252
2345
  const withExtension = `${candidate}${ext}`;
2253
2346
  try {
2254
- await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
2347
+ await (0, import_promises9.access)(withExtension, import_node_fs3.constants.F_OK);
2255
2348
  return withExtension;
2256
2349
  } catch {
2257
2350
  }
@@ -3313,7 +3406,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3313
3406
 
3314
3407
  // src/evaluation/providers/targets-file.ts
3315
3408
  var import_node_fs4 = require("fs");
3316
- var import_promises9 = require("fs/promises");
3409
+ var import_promises10 = require("fs/promises");
3317
3410
  var import_node_path12 = __toESM(require("path"), 1);
3318
3411
  var import_yaml3 = require("yaml");
3319
3412
 
@@ -3376,7 +3469,7 @@ function assertTargetDefinition(value, index, filePath) {
3376
3469
  }
3377
3470
  async function fileExists3(filePath) {
3378
3471
  try {
3379
- await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
3472
+ await (0, import_promises10.access)(filePath, import_node_fs4.constants.F_OK);
3380
3473
  return true;
3381
3474
  } catch {
3382
3475
  return false;
@@ -3387,7 +3480,7 @@ async function readTargetDefinitions(filePath) {
3387
3480
  if (!await fileExists3(absolutePath)) {
3388
3481
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3389
3482
  }
3390
- const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
3483
+ const raw = await (0, import_promises10.readFile)(absolutePath, "utf8");
3391
3484
  const parsed = (0, import_yaml3.parse)(raw);
3392
3485
  if (!isRecord(parsed)) {
3393
3486
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -3438,16 +3531,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
3438
3531
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3439
3532
 
3440
3533
  [[ ## expected_outcome ## ]]
3441
- {{expected_outcome}}
3534
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
3442
3535
 
3443
3536
  [[ ## question ## ]]
3444
- {{question}}
3537
+ {{${TEMPLATE_VARIABLES.QUESTION}}}
3445
3538
 
3446
3539
  [[ ## reference_answer ## ]]
3447
- {{reference_answer}}
3540
+ {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
3448
3541
 
3449
3542
  [[ ## candidate_answer ## ]]
3450
- {{candidate_answer}}`;
3543
+ {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
3451
3544
  var LlmJudgeEvaluator = class {
3452
3545
  kind = "llm_judge";
3453
3546
  resolveJudgeProvider;
@@ -3470,12 +3563,12 @@ var LlmJudgeEvaluator = class {
3470
3563
  async evaluateWithPrompt(context, judgeProvider) {
3471
3564
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3472
3565
  const variables = {
3473
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3474
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3475
- candidate_answer: context.candidate.trim(),
3476
- reference_answer: (context.evalCase.reference_answer ?? "").trim(),
3477
- expected_outcome: context.evalCase.expected_outcome.trim(),
3478
- question: formattedQuestion.trim()
3566
+ [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
3567
+ [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(context.evalCase.expected_segments, null, 2),
3568
+ [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
3569
+ [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
3570
+ [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
3571
+ [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
3479
3572
  };
3480
3573
  const systemPrompt = buildOutputSchema();
3481
3574
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
@@ -3707,14 +3800,14 @@ function parseJsonSafe(payload) {
3707
3800
  }
3708
3801
  }
3709
3802
  function substituteVariables(template, variables) {
3710
- return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
3803
+ return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
3711
3804
  return variables[varName] ?? match;
3712
3805
  });
3713
3806
  }
3714
3807
 
3715
3808
  // src/evaluation/orchestrator.ts
3716
3809
  var import_node_crypto2 = require("crypto");
3717
- var import_promises10 = require("fs/promises");
3810
+ var import_promises11 = require("fs/promises");
3718
3811
  var import_node_path13 = __toESM(require("path"), 1);
3719
3812
 
3720
3813
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
@@ -3871,11 +3964,11 @@ async function runEvaluation(options) {
3871
3964
  now,
3872
3965
  evalId,
3873
3966
  verbose,
3967
+ evalCases: preloadedEvalCases,
3874
3968
  onResult,
3875
3969
  onProgress
3876
3970
  } = options;
3877
- const load = loadEvalCases;
3878
- const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
3971
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
3879
3972
  const filteredEvalCases = filterEvalCases(evalCases, evalId);
3880
3973
  if (filteredEvalCases.length === 0) {
3881
3974
  if (evalId) {
@@ -4059,8 +4152,9 @@ async function runBatchEvaluation(options) {
4059
4152
  agentTimeoutMs
4060
4153
  } = options;
4061
4154
  const promptInputsList = [];
4155
+ const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
4062
4156
  for (const evalCase of evalCases) {
4063
- const promptInputs = await buildPromptInputs(evalCase);
4157
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
4064
4158
  if (promptDumpDir) {
4065
4159
  await dumpPrompt(promptDumpDir, evalCase, promptInputs);
4066
4160
  }
@@ -4166,7 +4260,8 @@ async function runEvalCase(options) {
4166
4260
  signal,
4167
4261
  judgeProvider
4168
4262
  } = options;
4169
- const promptInputs = await buildPromptInputs(evalCase);
4263
+ const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
4264
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
4170
4265
  if (promptDumpDir) {
4171
4266
  await dumpPrompt(promptDumpDir, evalCase, promptInputs);
4172
4267
  }
@@ -4455,7 +4550,8 @@ async function runLlmJudgeEvaluator(options) {
4455
4550
  async function resolveCustomPrompt(config) {
4456
4551
  if (config.promptPath) {
4457
4552
  try {
4458
- return await readTextFile(config.promptPath);
4553
+ const content = await readTextFile(config.promptPath);
4554
+ return content;
4459
4555
  } catch (error) {
4460
4556
  const message = error instanceof Error ? error.message : String(error);
4461
4557
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
@@ -4490,14 +4586,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
4490
4586
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
4491
4587
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
4492
4588
  const filePath = import_node_path13.default.resolve(directory, filename);
4493
- await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
4589
+ await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
4494
4590
  const payload = {
4495
4591
  eval_id: evalCase.id,
4496
4592
  question: promptInputs.question,
4497
4593
  guidelines: promptInputs.guidelines,
4498
4594
  guideline_paths: evalCase.guideline_paths
4499
4595
  };
4500
- await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4596
+ await (0, import_promises11.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4501
4597
  }
4502
4598
  function sanitizeFilename(value) {
4503
4599
  if (!value) {