@agentv/core 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,4 @@
1
1
  import {
2
- TARGETS_SCHEMA_V2,
3
2
  buildDirectoryChain,
4
3
  buildSearchRoots,
5
4
  fileExists,
@@ -9,7 +8,7 @@ import {
9
8
  readTextFile,
10
9
  resolveFileReference,
11
10
  resolveTargetDefinition
12
- } from "./chunk-IOCVST3R.js";
11
+ } from "./chunk-YCIZ33BO.js";
13
12
 
14
13
  // src/evaluation/types.ts
15
14
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -62,7 +61,7 @@ function getHitCount(result) {
62
61
  }
63
62
 
64
63
  // src/evaluation/yaml-parser.ts
65
- import { readFile as readFile4 } from "node:fs/promises";
64
+ import { readFile as readFile5 } from "node:fs/promises";
66
65
  import path6 from "node:path";
67
66
  import { parse as parse2 } from "yaml";
68
67
 
@@ -71,11 +70,11 @@ function extractCodeBlocks(segments) {
71
70
  const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
72
71
  const codeBlocks = [];
73
72
  for (const segment of segments) {
74
- const typeValue = segment["type"];
73
+ const typeValue = segment.type;
75
74
  if (typeof typeValue !== "string" || typeValue !== "text") {
76
75
  continue;
77
76
  }
78
- const textValue = segment["value"];
77
+ const textValue = segment.value;
79
78
  if (typeof textValue !== "string") {
80
79
  continue;
81
80
  }
@@ -100,7 +99,7 @@ ${part.content}
100
99
  }
101
100
  return parts.map((p) => p.content).join(" ");
102
101
  }
103
- function formatSegment(segment) {
102
+ function formatSegment(segment, mode = "lm") {
104
103
  const type = asString(segment.type);
105
104
  if (type === "text") {
106
105
  return asString(segment.value);
@@ -110,8 +109,14 @@ function formatSegment(segment) {
110
109
  return refPath ? `<Attached: ${refPath}>` : void 0;
111
110
  }
112
111
  if (type === "file") {
113
- const text = asString(segment.text);
114
112
  const filePath = asString(segment.path);
113
+ if (!filePath) {
114
+ return void 0;
115
+ }
116
+ if (mode === "agent") {
117
+ return `<file: path="${filePath}">`;
118
+ }
119
+ const text = asString(segment.text);
115
120
  if (text && filePath) {
116
121
  return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
117
122
  }
@@ -140,9 +145,9 @@ function asString(value) {
140
145
  }
141
146
 
142
147
  // src/evaluation/loaders/config-loader.ts
143
- import micromatch from "micromatch";
144
148
  import { readFile } from "node:fs/promises";
145
149
  import path2 from "node:path";
150
+ import micromatch from "micromatch";
146
151
  import { parse } from "yaml";
147
152
 
148
153
  // src/evaluation/loaders/file-resolver.ts
@@ -284,8 +289,9 @@ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
284
289
  guideline_patterns: guidelinePatterns
285
290
  };
286
291
  } catch (error) {
287
- logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
288
- continue;
292
+ logWarning(
293
+ `Could not read .agentv/config.yaml at ${configPath}: ${error.message}`
294
+ );
289
295
  }
290
296
  }
291
297
  return null;
@@ -315,8 +321,66 @@ function logWarning(message) {
315
321
 
316
322
  // src/evaluation/loaders/evaluator-parser.ts
317
323
  import path3 from "node:path";
324
+
325
+ // src/evaluation/validation/prompt-validator.ts
326
+ import { readFile as readFile2 } from "node:fs/promises";
327
+
328
+ // src/evaluation/template-variables.ts
329
+ var TEMPLATE_VARIABLES = {
330
+ CANDIDATE_ANSWER: "candidate_answer",
331
+ EXPECTED_MESSAGES: "expected_messages",
332
+ QUESTION: "question",
333
+ EXPECTED_OUTCOME: "expected_outcome",
334
+ REFERENCE_ANSWER: "reference_answer",
335
+ INPUT_MESSAGES: "input_messages"
336
+ };
337
+ var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
338
+ var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
339
+ TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
340
+ TEMPLATE_VARIABLES.EXPECTED_MESSAGES
341
+ ]);
342
+
343
+ // src/evaluation/validation/prompt-validator.ts
318
344
  var ANSI_YELLOW2 = "\x1B[33m";
319
345
  var ANSI_RESET2 = "\x1B[0m";
346
+ async function validateCustomPromptContent(promptPath) {
347
+ const content = await readFile2(promptPath, "utf8");
348
+ validateTemplateVariables(content, promptPath);
349
+ }
350
+ function validateTemplateVariables(content, source) {
351
+ const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
352
+ const foundVariables = /* @__PURE__ */ new Set();
353
+ const invalidVariables = [];
354
+ let match = variablePattern.exec(content);
355
+ while (match !== null) {
356
+ const varName = match[1];
357
+ foundVariables.add(varName);
358
+ if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
359
+ invalidVariables.push(varName);
360
+ }
361
+ match = variablePattern.exec(content);
362
+ }
363
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
364
+ const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
365
+ const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
366
+ if (!hasRequiredFields) {
367
+ throw new Error(
368
+ `Missing required fields. Must include at least one of:
369
+ - {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
370
+ - {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
371
+ );
372
+ }
373
+ if (invalidVariables.length > 0) {
374
+ const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
375
+ Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
376
+ Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
377
+ console.warn(warningMessage);
378
+ }
379
+ }
380
+
381
+ // src/evaluation/loaders/evaluator-parser.ts
382
+ var ANSI_YELLOW3 = "\x1B[33m";
383
+ var ANSI_RESET3 = "\x1B[0m";
320
384
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
321
385
  const execution = rawEvalCase.execution;
322
386
  const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
@@ -375,6 +439,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
375
439
  const resolved = await resolveFileReference2(prompt, searchRoots);
376
440
  if (resolved.resolvedPath) {
377
441
  promptPath = path3.resolve(resolved.resolvedPath);
442
+ try {
443
+ await validateCustomPromptContent(promptPath);
444
+ } catch (error) {
445
+ const message = error instanceof Error ? error.message : String(error);
446
+ throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
447
+ }
378
448
  } else {
379
449
  logWarning2(
380
450
  `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
@@ -411,18 +481,18 @@ function isJsonObject2(value) {
411
481
  function logWarning2(message, details) {
412
482
  if (details && details.length > 0) {
413
483
  const detailBlock = details.join("\n");
414
- console.warn(`${ANSI_YELLOW2}Warning: ${message}
415
- ${detailBlock}${ANSI_RESET2}`);
484
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
485
+ ${detailBlock}${ANSI_RESET3}`);
416
486
  } else {
417
- console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
487
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
418
488
  }
419
489
  }
420
490
 
421
491
  // src/evaluation/loaders/message-processor.ts
422
- import { readFile as readFile2 } from "node:fs/promises";
492
+ import { readFile as readFile3 } from "node:fs/promises";
423
493
  import path4 from "node:path";
424
- var ANSI_YELLOW3 = "\x1B[33m";
425
- var ANSI_RESET3 = "\x1B[0m";
494
+ var ANSI_YELLOW4 = "\x1B[33m";
495
+ var ANSI_RESET4 = "\x1B[0m";
426
496
  async function processMessages(options) {
427
497
  const {
428
498
  messages,
@@ -465,7 +535,7 @@ async function processMessages(options) {
465
535
  continue;
466
536
  }
467
537
  try {
468
- const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
538
+ const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
469
539
  if (messageType === "input" && guidelinePatterns && guidelinePaths) {
470
540
  const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
471
541
  if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
@@ -536,7 +606,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
536
606
  continue;
537
607
  }
538
608
  try {
539
- const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
609
+ const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
540
610
  parts.push({ content: fileContent, isFile: true, displayPath });
541
611
  if (verbose) {
542
612
  console.log(` [Expected Assistant File] Found: ${displayPath}`);
@@ -586,19 +656,19 @@ function cloneJsonValue(value) {
586
656
  function logWarning3(message, details) {
587
657
  if (details && details.length > 0) {
588
658
  const detailBlock = details.join("\n");
589
- console.warn(`${ANSI_YELLOW3}Warning: ${message}
590
- ${detailBlock}${ANSI_RESET3}`);
659
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}
660
+ ${detailBlock}${ANSI_RESET4}`);
591
661
  } else {
592
- console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
662
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
593
663
  }
594
664
  }
595
665
 
596
666
  // src/evaluation/formatting/prompt-builder.ts
597
- import { readFile as readFile3 } from "node:fs/promises";
667
+ import { readFile as readFile4 } from "node:fs/promises";
598
668
  import path5 from "node:path";
599
- var ANSI_YELLOW4 = "\x1B[33m";
600
- var ANSI_RESET4 = "\x1B[0m";
601
- async function buildPromptInputs(testCase) {
669
+ var ANSI_YELLOW5 = "\x1B[33m";
670
+ var ANSI_RESET5 = "\x1B[0m";
671
+ async function buildPromptInputs(testCase, mode = "lm") {
602
672
  const guidelineParts = [];
603
673
  for (const rawPath of testCase.guideline_paths) {
604
674
  const absolutePath = path5.resolve(rawPath);
@@ -607,7 +677,7 @@ async function buildPromptInputs(testCase) {
607
677
  continue;
608
678
  }
609
679
  try {
610
- const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
680
+ const content = (await readFile4(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
611
681
  guidelineParts.push({
612
682
  content,
613
683
  isFile: true,
@@ -674,7 +744,7 @@ async function buildPromptInputs(testCase) {
674
744
  const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
675
745
  const contentParts = [];
676
746
  for (const segment of segments) {
677
- const formattedContent = formatSegment(segment);
747
+ const formattedContent = formatSegment(segment, mode);
678
748
  if (formattedContent) {
679
749
  contentParts.push(formattedContent);
680
750
  }
@@ -689,7 +759,11 @@ ${messageContent}`);
689
759
  } else {
690
760
  const questionParts = [];
691
761
  for (const segment of testCase.input_segments) {
692
- const formattedContent = formatSegment(segment);
762
+ if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
763
+ questionParts.push(`<Attached: ${segment.path}>`);
764
+ continue;
765
+ }
766
+ const formattedContent = formatSegment(segment, mode);
693
767
  if (formattedContent) {
694
768
  questionParts.push(formattedContent);
695
769
  }
@@ -703,7 +777,8 @@ ${messageContent}`);
703
777
  messages: testCase.input_messages,
704
778
  segmentsByMessage,
705
779
  guidelinePatterns: testCase.guideline_patterns,
706
- guidelineContent: guidelines
780
+ guidelineContent: guidelines,
781
+ mode
707
782
  }) : void 0;
708
783
  return { question, guidelines, chatPrompt };
709
784
  }
@@ -720,7 +795,14 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
720
795
  return messagesWithContent > 1;
721
796
  }
722
797
  function buildChatPromptFromSegments(options) {
723
- const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
798
+ const {
799
+ messages,
800
+ segmentsByMessage,
801
+ guidelinePatterns,
802
+ guidelineContent,
803
+ systemPrompt,
804
+ mode = "lm"
805
+ } = options;
724
806
  if (messages.length === 0) {
725
807
  return void 0;
726
808
  }
@@ -738,7 +820,7 @@ ${guidelineContent.trim()}`);
738
820
  const segments = segmentsByMessage[startIndex];
739
821
  const contentParts = [];
740
822
  for (const segment of segments) {
741
- const formatted = formatSegment(segment);
823
+ const formatted = formatSegment(segment, mode);
742
824
  if (formatted) {
743
825
  contentParts.push(formatted);
744
826
  }
@@ -771,7 +853,7 @@ ${guidelineContent.trim()}`);
771
853
  if (segment.type === "guideline_ref") {
772
854
  continue;
773
855
  }
774
- const formatted = formatSegment(segment);
856
+ const formatted = formatSegment(segment, mode);
775
857
  if (formatted) {
776
858
  const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
777
859
  if (isGuidelineRef) {
@@ -795,17 +877,17 @@ function asString4(value) {
795
877
  return typeof value === "string" ? value : void 0;
796
878
  }
797
879
  function logWarning4(message) {
798
- console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
880
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
799
881
  }
800
882
 
801
883
  // src/evaluation/yaml-parser.ts
802
- var ANSI_YELLOW5 = "\x1B[33m";
803
- var ANSI_RESET5 = "\x1B[0m";
804
- var SCHEMA_EVAL_V2 = "agentv-eval-v2";
884
+ var ANSI_YELLOW6 = "\x1B[33m";
885
+ var ANSI_RED = "\x1B[31m";
886
+ var ANSI_RESET6 = "\x1B[0m";
805
887
  async function readTestSuiteMetadata(testFilePath) {
806
888
  try {
807
889
  const absolutePath = path6.resolve(testFilePath);
808
- const content = await readFile4(absolutePath, "utf8");
890
+ const content = await readFile5(absolutePath, "utf8");
809
891
  const parsed = parse2(content);
810
892
  if (!isJsonObject(parsed)) {
811
893
  return {};
@@ -823,7 +905,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
823
905
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
824
906
  const config = await loadConfig(absoluteTestPath, repoRootPath);
825
907
  const guidelinePatterns = config?.guideline_patterns;
826
- const rawFile = await readFile4(absoluteTestPath, "utf8");
908
+ const rawFile = await readFile5(absoluteTestPath, "utf8");
827
909
  const parsed = parse2(rawFile);
828
910
  if (!isJsonObject(parsed)) {
829
911
  throw new Error(`Invalid test file format: ${evalFilePath}`);
@@ -832,12 +914,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
832
914
  const datasetNameFromSuite = asString5(suite.dataset)?.trim();
833
915
  const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
834
916
  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
835
- const schema = suite.$schema;
836
- if (schema !== SCHEMA_EVAL_V2) {
837
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
838
- Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
839
- throw new Error(message);
840
- }
841
917
  const rawTestcases = suite.evalcases;
842
918
  if (!Array.isArray(rawTestcases)) {
843
919
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
@@ -861,14 +937,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
861
937
  const inputMessagesValue = evalcase.input_messages;
862
938
  const expectedMessagesValue = evalcase.expected_messages;
863
939
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
864
- logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
940
+ logError(
941
+ `Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
942
+ );
865
943
  continue;
866
944
  }
867
945
  const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
868
- const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
946
+ const inputMessages = inputMessagesValue.filter(
947
+ (msg) => isTestMessage(msg)
948
+ );
869
949
  const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
870
950
  if (hasExpectedMessages && expectedMessages.length === 0) {
871
- logWarning5(`No valid expected message found for eval case: ${id}`);
951
+ logError(`No valid expected message found for eval case: ${id}`);
872
952
  continue;
873
953
  }
874
954
  if (expectedMessages.length > 1) {
@@ -899,7 +979,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
899
979
  const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
900
980
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
901
981
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
902
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
982
+ let evaluators;
983
+ try {
984
+ evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
985
+ } catch (error) {
986
+ const message = error instanceof Error ? error.message : String(error);
987
+ logError(`Skipping eval case '${id}': ${message}`);
988
+ continue;
989
+ }
903
990
  const userFilePaths = [];
904
991
  for (const segment of inputSegments) {
905
992
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -917,7 +1004,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
917
1004
  question,
918
1005
  input_messages: inputMessages,
919
1006
  input_segments: inputSegments,
920
- output_segments: outputSegments,
1007
+ expected_segments: outputSegments,
921
1008
  reference_answer: referenceAnswer,
922
1009
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
923
1010
  guideline_patterns: guidelinePatterns,
@@ -949,10 +1036,19 @@ function asString5(value) {
949
1036
  function logWarning5(message, details) {
950
1037
  if (details && details.length > 0) {
951
1038
  const detailBlock = details.join("\n");
952
- console.warn(`${ANSI_YELLOW5}Warning: ${message}
953
- ${detailBlock}${ANSI_RESET5}`);
1039
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}
1040
+ ${detailBlock}${ANSI_RESET6}`);
954
1041
  } else {
955
- console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
1042
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
1043
+ }
1044
+ }
1045
+ function logError(message, details) {
1046
+ if (details && details.length > 0) {
1047
+ const detailBlock = details.join("\n");
1048
+ console.error(`${ANSI_RED}Error: ${message}
1049
+ ${detailBlock}${ANSI_RESET6}`);
1050
+ } else {
1051
+ console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
956
1052
  }
957
1053
  }
958
1054
 
@@ -1522,7 +1618,7 @@ function formatTimeoutSuffix(timeoutMs) {
1522
1618
  import { exec as execCallback, spawn } from "node:child_process";
1523
1619
  import { randomUUID } from "node:crypto";
1524
1620
  import { constants as constants2, createWriteStream } from "node:fs";
1525
- import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
1621
+ import { access as access2, mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
1526
1622
  import { tmpdir } from "node:os";
1527
1623
  import path9 from "node:path";
1528
1624
  import { promisify as promisify2 } from "node:util";
@@ -1590,9 +1686,7 @@ function buildPromptDocument(request, inputFiles, options) {
1590
1686
  options?.guidelineOverrides
1591
1687
  );
1592
1688
  const inputFilesList = collectInputFiles(inputFiles);
1593
- const nonGuidelineInputFiles = inputFilesList.filter(
1594
- (file) => !guidelineFiles.includes(file)
1595
- );
1689
+ const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
1596
1690
  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
1597
1691
  if (prereadBlock.length > 0) {
1598
1692
  parts.push("\n", prereadBlock);
@@ -1764,7 +1858,15 @@ var CodexProvider = class {
1764
1858
  return path9.resolve(this.config.cwd);
1765
1859
  }
1766
1860
  buildCodexArgs() {
1767
- const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
1861
+ const args = [
1862
+ "--ask-for-approval",
1863
+ "never",
1864
+ "exec",
1865
+ "--json",
1866
+ "--color",
1867
+ "never",
1868
+ "--skip-git-repo-check"
1869
+ ];
1768
1870
  if (this.config.args && this.config.args.length > 0) {
1769
1871
  args.push(...this.config.args);
1770
1872
  }
@@ -2388,7 +2490,12 @@ var MockProvider = class {
2388
2490
 
2389
2491
  // src/evaluation/providers/vscode.ts
2390
2492
  import path10 from "node:path";
2391
- import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
2493
+ import {
2494
+ dispatchAgentSession,
2495
+ dispatchBatchAgent,
2496
+ getSubagentRoot,
2497
+ provisionSubagents
2498
+ } from "subagent";
2392
2499
  var VSCodeProvider = class {
2393
2500
  id;
2394
2501
  kind;
@@ -2505,9 +2612,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
2505
2612
  }
2506
2613
  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
2507
2614
  const attachmentFiles = collectAttachmentFiles(attachments);
2508
- const nonGuidelineAttachments = attachmentFiles.filter(
2509
- (file) => !guidelineFiles.includes(file)
2510
- );
2615
+ const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
2511
2616
  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
2512
2617
  if (prereadBlock.length > 0) {
2513
2618
  parts.push("\n", prereadBlock);
@@ -2616,8 +2721,10 @@ async function ensureVSCodeSubagents(options) {
2616
2721
  if (result.skippedExisting.length > 0) {
2617
2722
  console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
2618
2723
  }
2619
- console.log(`
2620
- total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
2724
+ console.log(
2725
+ `
2726
+ total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`
2727
+ );
2621
2728
  }
2622
2729
  return {
2623
2730
  provisioned: true,
@@ -2637,33 +2744,12 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2637
2744
 
2638
2745
  // src/evaluation/providers/targets-file.ts
2639
2746
  import { constants as constants3 } from "node:fs";
2640
- import { access as access3, readFile as readFile5 } from "node:fs/promises";
2747
+ import { access as access3, readFile as readFile6 } from "node:fs/promises";
2641
2748
  import path11 from "node:path";
2642
2749
  import { parse as parse3 } from "yaml";
2643
2750
  function isRecord(value) {
2644
2751
  return typeof value === "object" && value !== null && !Array.isArray(value);
2645
2752
  }
2646
- function checkSchema(parsed, absolutePath) {
2647
- const schema = parsed.$schema;
2648
- if (schema === void 0) {
2649
- throw new Error(
2650
- `Missing $schema field in targets.yaml at ${absolutePath}.
2651
- Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
2652
- );
2653
- }
2654
- if (typeof schema !== "string") {
2655
- throw new Error(
2656
- `Invalid $schema field in targets.yaml at ${absolutePath}.
2657
- Expected a string value '${TARGETS_SCHEMA_V2}'.`
2658
- );
2659
- }
2660
- if (schema !== TARGETS_SCHEMA_V2) {
2661
- throw new Error(
2662
- `Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
2663
- Expected '${TARGETS_SCHEMA_V2}'.`
2664
- );
2665
- }
2666
- }
2667
2753
  function extractTargetsArray(parsed, absolutePath) {
2668
2754
  const targets = parsed.targets;
2669
2755
  if (!Array.isArray(targets)) {
@@ -2678,7 +2764,9 @@ function assertTargetDefinition(value, index, filePath) {
2678
2764
  const name = value.name;
2679
2765
  const provider = value.provider;
2680
2766
  if (typeof name !== "string" || name.trim().length === 0) {
2681
- throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
2767
+ throw new Error(
2768
+ `targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`
2769
+ );
2682
2770
  }
2683
2771
  if (typeof provider !== "string" || provider.trim().length === 0) {
2684
2772
  throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
@@ -2698,14 +2786,15 @@ async function readTargetDefinitions(filePath) {
2698
2786
  if (!await fileExists3(absolutePath)) {
2699
2787
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2700
2788
  }
2701
- const raw = await readFile5(absolutePath, "utf8");
2789
+ const raw = await readFile6(absolutePath, "utf8");
2702
2790
  const parsed = parse3(raw);
2703
2791
  if (!isRecord(parsed)) {
2704
- throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
2792
+ throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
2705
2793
  }
2706
- checkSchema(parsed, absolutePath);
2707
2794
  const targets = extractTargetsArray(parsed, absolutePath);
2708
- const definitions = targets.map((entry, index) => assertTargetDefinition(entry, index, absolutePath));
2795
+ const definitions = targets.map(
2796
+ (entry, index) => assertTargetDefinition(entry, index, absolutePath)
2797
+ );
2709
2798
  return definitions;
2710
2799
  }
2711
2800
  function listTargetNames(definitions) {
@@ -2749,16 +2838,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
2749
2838
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
2750
2839
 
2751
2840
  [[ ## expected_outcome ## ]]
2752
- {{expected_outcome}}
2841
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
2753
2842
 
2754
2843
  [[ ## question ## ]]
2755
- {{question}}
2844
+ {{${TEMPLATE_VARIABLES.QUESTION}}}
2756
2845
 
2757
2846
  [[ ## reference_answer ## ]]
2758
- {{reference_answer}}
2847
+ {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
2759
2848
 
2760
2849
  [[ ## candidate_answer ## ]]
2761
- {{candidate_answer}}`;
2850
+ {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
2762
2851
  var LlmJudgeEvaluator = class {
2763
2852
  kind = "llm_judge";
2764
2853
  resolveJudgeProvider;
@@ -2781,12 +2870,16 @@ var LlmJudgeEvaluator = class {
2781
2870
  async evaluateWithPrompt(context, judgeProvider) {
2782
2871
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
2783
2872
  const variables = {
2784
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2785
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2786
- candidate_answer: context.candidate.trim(),
2787
- reference_answer: (context.evalCase.reference_answer ?? "").trim(),
2788
- expected_outcome: context.evalCase.expected_outcome.trim(),
2789
- question: formattedQuestion.trim()
2873
+ [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
2874
+ [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
2875
+ context.evalCase.expected_segments,
2876
+ null,
2877
+ 2
2878
+ ),
2879
+ [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
2880
+ [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
2881
+ [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
2882
+ [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
2790
2883
  };
2791
2884
  const systemPrompt = buildOutputSchema();
2792
2885
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
@@ -3018,7 +3111,7 @@ function parseJsonSafe(payload) {
3018
3111
  }
3019
3112
  }
3020
3113
  function substituteVariables(template, variables) {
3021
- return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
3114
+ return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
3022
3115
  return variables[varName] ?? match;
3023
3116
  });
3024
3117
  }
@@ -3028,7 +3121,7 @@ import { createHash, randomUUID as randomUUID2 } from "node:crypto";
3028
3121
  import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
3029
3122
  import path12 from "node:path";
3030
3123
 
3031
- // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
3124
+ // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
3032
3125
  var Node = class {
3033
3126
  value;
3034
3127
  next;
@@ -3061,6 +3154,9 @@ var Queue = class {
3061
3154
  }
3062
3155
  this.#head = this.#head.next;
3063
3156
  this.#size--;
3157
+ if (!this.#head) {
3158
+ this.#tail = void 0;
3159
+ }
3064
3160
  return current.value;
3065
3161
  }
3066
3162
  peek() {
@@ -3091,7 +3187,7 @@ var Queue = class {
3091
3187
  }
3092
3188
  };
3093
3189
 
3094
- // ../../node_modules/.pnpm/p-limit@6.2.0/node_modules/p-limit/index.js
3190
+ // ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
3095
3191
  function pLimit(concurrency) {
3096
3192
  validateConcurrency(concurrency);
3097
3193
  const queue = new Queue();
@@ -3182,11 +3278,11 @@ async function runEvaluation(options) {
3182
3278
  now,
3183
3279
  evalId,
3184
3280
  verbose,
3281
+ evalCases: preloadedEvalCases,
3185
3282
  onResult,
3186
3283
  onProgress
3187
3284
  } = options;
3188
- const load = loadEvalCases;
3189
- const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
3285
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
3190
3286
  const filteredEvalCases = filterEvalCases(evalCases, evalId);
3191
3287
  if (filteredEvalCases.length === 0) {
3192
3288
  if (evalId) {
@@ -3267,7 +3363,9 @@ async function runEvaluation(options) {
3267
3363
  } catch (error) {
3268
3364
  if (verbose) {
3269
3365
  const message = error instanceof Error ? error.message : String(error);
3270
- console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
3366
+ console.warn(
3367
+ `Provider batch execution failed, falling back to per-case dispatch: ${message}`
3368
+ );
3271
3369
  }
3272
3370
  }
3273
3371
  }
@@ -3370,8 +3468,9 @@ async function runBatchEvaluation(options) {
3370
3468
  agentTimeoutMs
3371
3469
  } = options;
3372
3470
  const promptInputsList = [];
3471
+ const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
3373
3472
  for (const evalCase of evalCases) {
3374
- const promptInputs = await buildPromptInputs(evalCase);
3473
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
3375
3474
  if (promptDumpDir) {
3376
3475
  await dumpPrompt(promptDumpDir, evalCase, promptInputs);
3377
3476
  }
@@ -3430,7 +3529,14 @@ async function runBatchEvaluation(options) {
3430
3529
  agentTimeoutMs
3431
3530
  });
3432
3531
  } catch (error) {
3433
- const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
3532
+ const errorResult = buildErrorResult(
3533
+ evalCase,
3534
+ target.name,
3535
+ nowFn(),
3536
+ error,
3537
+ promptInputs,
3538
+ provider
3539
+ );
3434
3540
  results.push(errorResult);
3435
3541
  if (onResult) {
3436
3542
  await onResult(errorResult);
@@ -3477,7 +3583,8 @@ async function runEvalCase(options) {
3477
3583
  signal,
3478
3584
  judgeProvider
3479
3585
  } = options;
3480
- const promptInputs = await buildPromptInputs(evalCase);
3586
+ const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
3587
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
3481
3588
  if (promptDumpDir) {
3482
3589
  await dumpPrompt(promptDumpDir, evalCase, promptInputs);
3483
3590
  }
@@ -3607,7 +3714,18 @@ async function evaluateCandidate(options) {
3607
3714
  };
3608
3715
  }
3609
3716
  async function runEvaluatorsForCase(options) {
3610
- const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
3717
+ const {
3718
+ evalCase,
3719
+ candidate,
3720
+ target,
3721
+ provider,
3722
+ evaluators,
3723
+ attempt,
3724
+ promptInputs,
3725
+ now,
3726
+ judgeProvider,
3727
+ agentTimeoutMs
3728
+ } = options;
3611
3729
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
3612
3730
  return runEvaluatorList({
3613
3731
  evalCase,
@@ -3708,7 +3826,6 @@ async function runEvaluatorList(options) {
3708
3826
  reasoning: score2.reasoning,
3709
3827
  evaluator_provider_request: score2.evaluatorRawRequest
3710
3828
  });
3711
- continue;
3712
3829
  }
3713
3830
  } catch (error) {
3714
3831
  const message = error instanceof Error ? error.message : String(error);
@@ -3719,7 +3836,11 @@ async function runEvaluatorList(options) {
3719
3836
  expectedAspectCount: 1,
3720
3837
  reasoning: message
3721
3838
  };
3722
- scored.push({ score: fallbackScore, name: evaluator.name ?? "unknown", type: evaluator.type ?? "unknown" });
3839
+ scored.push({
3840
+ score: fallbackScore,
3841
+ name: evaluator.name ?? "unknown",
3842
+ type: evaluator.type ?? "unknown"
3843
+ });
3723
3844
  evaluatorResults.push({
3724
3845
  name: evaluator.name ?? "unknown",
3725
3846
  type: evaluator.type ?? "unknown",
@@ -3733,7 +3854,10 @@ async function runEvaluatorList(options) {
3733
3854
  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
3734
3855
  const hits = scored.flatMap((entry) => entry.score.hits);
3735
3856
  const misses = scored.flatMap((entry) => entry.score.misses);
3736
- const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
3857
+ const expectedAspectCount = scored.reduce(
3858
+ (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
3859
+ 0
3860
+ );
3737
3861
  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
3738
3862
  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
3739
3863
  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
@@ -3748,7 +3872,18 @@ async function runEvaluatorList(options) {
3748
3872
  return { score, evaluatorResults };
3749
3873
  }
3750
3874
  async function runLlmJudgeEvaluator(options) {
3751
- const { config, evalCase, candidate, target, provider, evaluatorRegistry, attempt, promptInputs, now, judgeProvider } = options;
3875
+ const {
3876
+ config,
3877
+ evalCase,
3878
+ candidate,
3879
+ target,
3880
+ provider,
3881
+ evaluatorRegistry,
3882
+ attempt,
3883
+ promptInputs,
3884
+ now,
3885
+ judgeProvider
3886
+ } = options;
3752
3887
  const customPrompt = await resolveCustomPrompt(config);
3753
3888
  return evaluatorRegistry.llm_judge.evaluate({
3754
3889
  evalCase,
@@ -3766,7 +3901,8 @@ async function runLlmJudgeEvaluator(options) {
3766
3901
  async function resolveCustomPrompt(config) {
3767
3902
  if (config.promptPath) {
3768
3903
  try {
3769
- return await readTextFile(config.promptPath);
3904
+ const content = await readTextFile(config.promptPath);
3905
+ return content;
3770
3906
  } catch (error) {
3771
3907
  const message = error instanceof Error ? error.message : String(error);
3772
3908
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);