@agentv/core 0.15.0 → 0.16.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -101,7 +101,7 @@ interface EvalCase {
101
101
  readonly question: string;
102
102
  readonly input_messages: readonly TestMessage[];
103
103
  readonly input_segments: readonly JsonObject[];
104
- readonly output_segments: readonly JsonObject[];
104
+ readonly expected_segments: readonly JsonObject[];
105
105
  readonly reference_answer?: string;
106
106
  readonly guideline_paths: readonly string[];
107
107
  readonly guideline_patterns?: readonly string[];
@@ -147,6 +147,17 @@ interface EvaluatorResult {
147
147
  */
148
148
  declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
149
149
 
150
+ /**
151
+ * Formatting mode for segment content.
152
+ * - 'agent': File references only (for providers with filesystem access)
153
+ * - 'lm': Embedded file content with XML tags (for language model providers)
154
+ */
155
+ type FormattingMode = 'agent' | 'lm';
156
+ /**
157
+ * Extract fenced code blocks from AgentV user segments.
158
+ */
159
+ declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
160
+
150
161
  type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
151
162
  interface ChatMessage {
152
163
  readonly role: ChatMessageRole;
@@ -271,12 +282,13 @@ interface PromptInputs {
271
282
  readonly chatPrompt?: ChatPrompt;
272
283
  readonly systemMessage?: string;
273
284
  }
274
- declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
275
-
276
285
  /**
277
- * Extract fenced code blocks from AgentV user segments.
286
+ * Build prompt inputs by consolidating user request context and guideline content.
287
+ *
288
+ * @param testCase - The evaluation test case
289
+ * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
278
290
  */
279
- declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
291
+ declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
280
292
 
281
293
  /**
282
294
  * Determine whether a path references guideline content (instructions or prompts).
@@ -605,6 +617,7 @@ interface RunEvaluationOptions {
605
617
  readonly evalId?: string;
606
618
  readonly verbose?: boolean;
607
619
  readonly maxConcurrency?: number;
620
+ readonly evalCases?: readonly EvalCase[];
608
621
  readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
609
622
  readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
610
623
  }
package/dist/index.d.ts CHANGED
@@ -101,7 +101,7 @@ interface EvalCase {
101
101
  readonly question: string;
102
102
  readonly input_messages: readonly TestMessage[];
103
103
  readonly input_segments: readonly JsonObject[];
104
- readonly output_segments: readonly JsonObject[];
104
+ readonly expected_segments: readonly JsonObject[];
105
105
  readonly reference_answer?: string;
106
106
  readonly guideline_paths: readonly string[];
107
107
  readonly guideline_patterns?: readonly string[];
@@ -147,6 +147,17 @@ interface EvaluatorResult {
147
147
  */
148
148
  declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
149
149
 
150
+ /**
151
+ * Formatting mode for segment content.
152
+ * - 'agent': File references only (for providers with filesystem access)
153
+ * - 'lm': Embedded file content with XML tags (for language model providers)
154
+ */
155
+ type FormattingMode = 'agent' | 'lm';
156
+ /**
157
+ * Extract fenced code blocks from AgentV user segments.
158
+ */
159
+ declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
160
+
150
161
  type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
151
162
  interface ChatMessage {
152
163
  readonly role: ChatMessageRole;
@@ -271,12 +282,13 @@ interface PromptInputs {
271
282
  readonly chatPrompt?: ChatPrompt;
272
283
  readonly systemMessage?: string;
273
284
  }
274
- declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
275
-
276
285
  /**
277
- * Extract fenced code blocks from AgentV user segments.
286
+ * Build prompt inputs by consolidating user request context and guideline content.
287
+ *
288
+ * @param testCase - The evaluation test case
289
+ * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
278
290
  */
279
- declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
291
+ declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
280
292
 
281
293
  /**
282
294
  * Determine whether a path references guideline content (instructions or prompts).
@@ -605,6 +617,7 @@ interface RunEvaluationOptions {
605
617
  readonly evalId?: string;
606
618
  readonly verbose?: boolean;
607
619
  readonly maxConcurrency?: number;
620
+ readonly evalCases?: readonly EvalCase[];
608
621
  readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
609
622
  readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
610
623
  }
package/dist/index.js CHANGED
@@ -62,7 +62,7 @@ function getHitCount(result) {
62
62
  }
63
63
 
64
64
  // src/evaluation/yaml-parser.ts
65
- import { readFile as readFile4 } from "node:fs/promises";
65
+ import { readFile as readFile5 } from "node:fs/promises";
66
66
  import path6 from "node:path";
67
67
  import { parse as parse2 } from "yaml";
68
68
 
@@ -100,7 +100,7 @@ ${part.content}
100
100
  }
101
101
  return parts.map((p) => p.content).join(" ");
102
102
  }
103
- function formatSegment(segment) {
103
+ function formatSegment(segment, mode = "lm") {
104
104
  const type = asString(segment.type);
105
105
  if (type === "text") {
106
106
  return asString(segment.value);
@@ -110,8 +110,14 @@ function formatSegment(segment) {
110
110
  return refPath ? `<Attached: ${refPath}>` : void 0;
111
111
  }
112
112
  if (type === "file") {
113
- const text = asString(segment.text);
114
113
  const filePath = asString(segment.path);
114
+ if (!filePath) {
115
+ return void 0;
116
+ }
117
+ if (mode === "agent") {
118
+ return `<file: path="${filePath}">`;
119
+ }
120
+ const text = asString(segment.text);
115
121
  if (text && filePath) {
116
122
  return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
117
123
  }
@@ -315,8 +321,67 @@ function logWarning(message) {
315
321
 
316
322
  // src/evaluation/loaders/evaluator-parser.ts
317
323
  import path3 from "node:path";
324
+
325
+ // src/evaluation/validation/prompt-validator.ts
326
+ import { readFile as readFile2 } from "node:fs/promises";
327
+
328
+ // src/evaluation/template-variables.ts
329
+ var TEMPLATE_VARIABLES = {
330
+ CANDIDATE_ANSWER: "candidate_answer",
331
+ EXPECTED_MESSAGES: "expected_messages",
332
+ QUESTION: "question",
333
+ EXPECTED_OUTCOME: "expected_outcome",
334
+ REFERENCE_ANSWER: "reference_answer",
335
+ INPUT_MESSAGES: "input_messages"
336
+ };
337
+ var VALID_TEMPLATE_VARIABLES = new Set(
338
+ Object.values(TEMPLATE_VARIABLES)
339
+ );
340
+ var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
341
+ TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
342
+ TEMPLATE_VARIABLES.EXPECTED_MESSAGES
343
+ ]);
344
+
345
+ // src/evaluation/validation/prompt-validator.ts
318
346
  var ANSI_YELLOW2 = "\x1B[33m";
319
347
  var ANSI_RESET2 = "\x1B[0m";
348
+ async function validateCustomPromptContent(promptPath) {
349
+ const content = await readFile2(promptPath, "utf8");
350
+ validateTemplateVariables(content, promptPath);
351
+ }
352
+ function validateTemplateVariables(content, source) {
353
+ const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
354
+ const foundVariables = /* @__PURE__ */ new Set();
355
+ const invalidVariables = [];
356
+ let match;
357
+ while ((match = variablePattern.exec(content)) !== null) {
358
+ const varName = match[1];
359
+ foundVariables.add(varName);
360
+ if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
361
+ invalidVariables.push(varName);
362
+ }
363
+ }
364
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
365
+ const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
366
+ const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
367
+ if (!hasRequiredFields) {
368
+ throw new Error(
369
+ `Missing required fields. Must include at least one of:
370
+ - {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
371
+ - {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
372
+ );
373
+ }
374
+ if (invalidVariables.length > 0) {
375
+ const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
376
+ Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
377
+ Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
378
+ console.warn(warningMessage);
379
+ }
380
+ }
381
+
382
+ // src/evaluation/loaders/evaluator-parser.ts
383
+ var ANSI_YELLOW3 = "\x1B[33m";
384
+ var ANSI_RESET3 = "\x1B[0m";
320
385
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
321
386
  const execution = rawEvalCase.execution;
322
387
  const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
@@ -375,6 +440,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
375
440
  const resolved = await resolveFileReference2(prompt, searchRoots);
376
441
  if (resolved.resolvedPath) {
377
442
  promptPath = path3.resolve(resolved.resolvedPath);
443
+ try {
444
+ await validateCustomPromptContent(promptPath);
445
+ } catch (error) {
446
+ const message = error instanceof Error ? error.message : String(error);
447
+ throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
448
+ }
378
449
  } else {
379
450
  logWarning2(
380
451
  `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
@@ -411,18 +482,18 @@ function isJsonObject2(value) {
411
482
  function logWarning2(message, details) {
412
483
  if (details && details.length > 0) {
413
484
  const detailBlock = details.join("\n");
414
- console.warn(`${ANSI_YELLOW2}Warning: ${message}
415
- ${detailBlock}${ANSI_RESET2}`);
485
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
486
+ ${detailBlock}${ANSI_RESET3}`);
416
487
  } else {
417
- console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
488
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
418
489
  }
419
490
  }
420
491
 
421
492
  // src/evaluation/loaders/message-processor.ts
422
- import { readFile as readFile2 } from "node:fs/promises";
493
+ import { readFile as readFile3 } from "node:fs/promises";
423
494
  import path4 from "node:path";
424
- var ANSI_YELLOW3 = "\x1B[33m";
425
- var ANSI_RESET3 = "\x1B[0m";
495
+ var ANSI_YELLOW4 = "\x1B[33m";
496
+ var ANSI_RESET4 = "\x1B[0m";
426
497
  async function processMessages(options) {
427
498
  const {
428
499
  messages,
@@ -465,7 +536,7 @@ async function processMessages(options) {
465
536
  continue;
466
537
  }
467
538
  try {
468
- const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
539
+ const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
469
540
  if (messageType === "input" && guidelinePatterns && guidelinePaths) {
470
541
  const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
471
542
  if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
@@ -536,7 +607,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
536
607
  continue;
537
608
  }
538
609
  try {
539
- const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
610
+ const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
540
611
  parts.push({ content: fileContent, isFile: true, displayPath });
541
612
  if (verbose) {
542
613
  console.log(` [Expected Assistant File] Found: ${displayPath}`);
@@ -586,19 +657,19 @@ function cloneJsonValue(value) {
586
657
  function logWarning3(message, details) {
587
658
  if (details && details.length > 0) {
588
659
  const detailBlock = details.join("\n");
589
- console.warn(`${ANSI_YELLOW3}Warning: ${message}
590
- ${detailBlock}${ANSI_RESET3}`);
660
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}
661
+ ${detailBlock}${ANSI_RESET4}`);
591
662
  } else {
592
- console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
663
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
593
664
  }
594
665
  }
595
666
 
596
667
  // src/evaluation/formatting/prompt-builder.ts
597
- import { readFile as readFile3 } from "node:fs/promises";
668
+ import { readFile as readFile4 } from "node:fs/promises";
598
669
  import path5 from "node:path";
599
- var ANSI_YELLOW4 = "\x1B[33m";
600
- var ANSI_RESET4 = "\x1B[0m";
601
- async function buildPromptInputs(testCase) {
670
+ var ANSI_YELLOW5 = "\x1B[33m";
671
+ var ANSI_RESET5 = "\x1B[0m";
672
+ async function buildPromptInputs(testCase, mode = "lm") {
602
673
  const guidelineParts = [];
603
674
  for (const rawPath of testCase.guideline_paths) {
604
675
  const absolutePath = path5.resolve(rawPath);
@@ -607,7 +678,7 @@ async function buildPromptInputs(testCase) {
607
678
  continue;
608
679
  }
609
680
  try {
610
- const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
681
+ const content = (await readFile4(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
611
682
  guidelineParts.push({
612
683
  content,
613
684
  isFile: true,
@@ -674,7 +745,7 @@ async function buildPromptInputs(testCase) {
674
745
  const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
675
746
  const contentParts = [];
676
747
  for (const segment of segments) {
677
- const formattedContent = formatSegment(segment);
748
+ const formattedContent = formatSegment(segment, mode);
678
749
  if (formattedContent) {
679
750
  contentParts.push(formattedContent);
680
751
  }
@@ -689,7 +760,11 @@ ${messageContent}`);
689
760
  } else {
690
761
  const questionParts = [];
691
762
  for (const segment of testCase.input_segments) {
692
- const formattedContent = formatSegment(segment);
763
+ if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
764
+ questionParts.push(`<Attached: ${segment.path}>`);
765
+ continue;
766
+ }
767
+ const formattedContent = formatSegment(segment, mode);
693
768
  if (formattedContent) {
694
769
  questionParts.push(formattedContent);
695
770
  }
@@ -703,7 +778,8 @@ ${messageContent}`);
703
778
  messages: testCase.input_messages,
704
779
  segmentsByMessage,
705
780
  guidelinePatterns: testCase.guideline_patterns,
706
- guidelineContent: guidelines
781
+ guidelineContent: guidelines,
782
+ mode
707
783
  }) : void 0;
708
784
  return { question, guidelines, chatPrompt };
709
785
  }
@@ -720,7 +796,7 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
720
796
  return messagesWithContent > 1;
721
797
  }
722
798
  function buildChatPromptFromSegments(options) {
723
- const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
799
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt, mode = "lm" } = options;
724
800
  if (messages.length === 0) {
725
801
  return void 0;
726
802
  }
@@ -738,7 +814,7 @@ ${guidelineContent.trim()}`);
738
814
  const segments = segmentsByMessage[startIndex];
739
815
  const contentParts = [];
740
816
  for (const segment of segments) {
741
- const formatted = formatSegment(segment);
817
+ const formatted = formatSegment(segment, mode);
742
818
  if (formatted) {
743
819
  contentParts.push(formatted);
744
820
  }
@@ -771,7 +847,7 @@ ${guidelineContent.trim()}`);
771
847
  if (segment.type === "guideline_ref") {
772
848
  continue;
773
849
  }
774
- const formatted = formatSegment(segment);
850
+ const formatted = formatSegment(segment, mode);
775
851
  if (formatted) {
776
852
  const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
777
853
  if (isGuidelineRef) {
@@ -795,17 +871,18 @@ function asString4(value) {
795
871
  return typeof value === "string" ? value : void 0;
796
872
  }
797
873
  function logWarning4(message) {
798
- console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
874
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
799
875
  }
800
876
 
801
877
  // src/evaluation/yaml-parser.ts
802
- var ANSI_YELLOW5 = "\x1B[33m";
803
- var ANSI_RESET5 = "\x1B[0m";
878
+ var ANSI_YELLOW6 = "\x1B[33m";
879
+ var ANSI_RED = "\x1B[31m";
880
+ var ANSI_RESET6 = "\x1B[0m";
804
881
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
805
882
  async function readTestSuiteMetadata(testFilePath) {
806
883
  try {
807
884
  const absolutePath = path6.resolve(testFilePath);
808
- const content = await readFile4(absolutePath, "utf8");
885
+ const content = await readFile5(absolutePath, "utf8");
809
886
  const parsed = parse2(content);
810
887
  if (!isJsonObject(parsed)) {
811
888
  return {};
@@ -823,7 +900,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
823
900
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
824
901
  const config = await loadConfig(absoluteTestPath, repoRootPath);
825
902
  const guidelinePatterns = config?.guideline_patterns;
826
- const rawFile = await readFile4(absoluteTestPath, "utf8");
903
+ const rawFile = await readFile5(absoluteTestPath, "utf8");
827
904
  const parsed = parse2(rawFile);
828
905
  if (!isJsonObject(parsed)) {
829
906
  throw new Error(`Invalid test file format: ${evalFilePath}`);
@@ -861,14 +938,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
861
938
  const inputMessagesValue = evalcase.input_messages;
862
939
  const expectedMessagesValue = evalcase.expected_messages;
863
940
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
864
- logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
941
+ logError(`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`);
865
942
  continue;
866
943
  }
867
944
  const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
868
945
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
869
946
  const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
870
947
  if (hasExpectedMessages && expectedMessages.length === 0) {
871
- logWarning5(`No valid expected message found for eval case: ${id}`);
948
+ logError(`No valid expected message found for eval case: ${id}`);
872
949
  continue;
873
950
  }
874
951
  if (expectedMessages.length > 1) {
@@ -899,7 +976,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
899
976
  const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
900
977
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
901
978
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
902
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
979
+ let evaluators;
980
+ try {
981
+ evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
982
+ } catch (error) {
983
+ const message = error instanceof Error ? error.message : String(error);
984
+ logError(`Skipping eval case '${id}': ${message}`);
985
+ continue;
986
+ }
903
987
  const userFilePaths = [];
904
988
  for (const segment of inputSegments) {
905
989
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -917,7 +1001,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
917
1001
  question,
918
1002
  input_messages: inputMessages,
919
1003
  input_segments: inputSegments,
920
- output_segments: outputSegments,
1004
+ expected_segments: outputSegments,
921
1005
  reference_answer: referenceAnswer,
922
1006
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
923
1007
  guideline_patterns: guidelinePatterns,
@@ -949,10 +1033,19 @@ function asString5(value) {
949
1033
  function logWarning5(message, details) {
950
1034
  if (details && details.length > 0) {
951
1035
  const detailBlock = details.join("\n");
952
- console.warn(`${ANSI_YELLOW5}Warning: ${message}
953
- ${detailBlock}${ANSI_RESET5}`);
1036
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}
1037
+ ${detailBlock}${ANSI_RESET6}`);
954
1038
  } else {
955
- console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
1039
+ console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
1040
+ }
1041
+ }
1042
+ function logError(message, details) {
1043
+ if (details && details.length > 0) {
1044
+ const detailBlock = details.join("\n");
1045
+ console.error(`${ANSI_RED}Error: ${message}
1046
+ ${detailBlock}${ANSI_RESET6}`);
1047
+ } else {
1048
+ console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
956
1049
  }
957
1050
  }
958
1051
 
@@ -2637,7 +2730,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2637
2730
 
2638
2731
  // src/evaluation/providers/targets-file.ts
2639
2732
  import { constants as constants3 } from "node:fs";
2640
- import { access as access3, readFile as readFile5 } from "node:fs/promises";
2733
+ import { access as access3, readFile as readFile6 } from "node:fs/promises";
2641
2734
  import path11 from "node:path";
2642
2735
  import { parse as parse3 } from "yaml";
2643
2736
  function isRecord(value) {
@@ -2698,7 +2791,7 @@ async function readTargetDefinitions(filePath) {
2698
2791
  if (!await fileExists3(absolutePath)) {
2699
2792
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2700
2793
  }
2701
- const raw = await readFile5(absolutePath, "utf8");
2794
+ const raw = await readFile6(absolutePath, "utf8");
2702
2795
  const parsed = parse3(raw);
2703
2796
  if (!isRecord(parsed)) {
2704
2797
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -2749,16 +2842,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
2749
2842
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
2750
2843
 
2751
2844
  [[ ## expected_outcome ## ]]
2752
- {{expected_outcome}}
2845
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
2753
2846
 
2754
2847
  [[ ## question ## ]]
2755
- {{question}}
2848
+ {{${TEMPLATE_VARIABLES.QUESTION}}}
2756
2849
 
2757
2850
  [[ ## reference_answer ## ]]
2758
- {{reference_answer}}
2851
+ {{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
2759
2852
 
2760
2853
  [[ ## candidate_answer ## ]]
2761
- {{candidate_answer}}`;
2854
+ {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
2762
2855
  var LlmJudgeEvaluator = class {
2763
2856
  kind = "llm_judge";
2764
2857
  resolveJudgeProvider;
@@ -2781,12 +2874,12 @@ var LlmJudgeEvaluator = class {
2781
2874
  async evaluateWithPrompt(context, judgeProvider) {
2782
2875
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
2783
2876
  const variables = {
2784
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2785
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2786
- candidate_answer: context.candidate.trim(),
2787
- reference_answer: (context.evalCase.reference_answer ?? "").trim(),
2788
- expected_outcome: context.evalCase.expected_outcome.trim(),
2789
- question: formattedQuestion.trim()
2877
+ [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
2878
+ [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(context.evalCase.expected_segments, null, 2),
2879
+ [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
2880
+ [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
2881
+ [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
2882
+ [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
2790
2883
  };
2791
2884
  const systemPrompt = buildOutputSchema();
2792
2885
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
@@ -3018,7 +3111,7 @@ function parseJsonSafe(payload) {
3018
3111
  }
3019
3112
  }
3020
3113
  function substituteVariables(template, variables) {
3021
- return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
3114
+ return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
3022
3115
  return variables[varName] ?? match;
3023
3116
  });
3024
3117
  }
@@ -3182,11 +3275,11 @@ async function runEvaluation(options) {
3182
3275
  now,
3183
3276
  evalId,
3184
3277
  verbose,
3278
+ evalCases: preloadedEvalCases,
3185
3279
  onResult,
3186
3280
  onProgress
3187
3281
  } = options;
3188
- const load = loadEvalCases;
3189
- const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
3282
+ const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
3190
3283
  const filteredEvalCases = filterEvalCases(evalCases, evalId);
3191
3284
  if (filteredEvalCases.length === 0) {
3192
3285
  if (evalId) {
@@ -3370,8 +3463,9 @@ async function runBatchEvaluation(options) {
3370
3463
  agentTimeoutMs
3371
3464
  } = options;
3372
3465
  const promptInputsList = [];
3466
+ const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
3373
3467
  for (const evalCase of evalCases) {
3374
- const promptInputs = await buildPromptInputs(evalCase);
3468
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
3375
3469
  if (promptDumpDir) {
3376
3470
  await dumpPrompt(promptDumpDir, evalCase, promptInputs);
3377
3471
  }
@@ -3477,7 +3571,8 @@ async function runEvalCase(options) {
3477
3571
  signal,
3478
3572
  judgeProvider
3479
3573
  } = options;
3480
- const promptInputs = await buildPromptInputs(evalCase);
3574
+ const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
3575
+ const promptInputs = await buildPromptInputs(evalCase, formattingMode);
3481
3576
  if (promptDumpDir) {
3482
3577
  await dumpPrompt(promptDumpDir, evalCase, promptInputs);
3483
3578
  }
@@ -3766,7 +3861,8 @@ async function runLlmJudgeEvaluator(options) {
3766
3861
  async function resolveCustomPrompt(config) {
3767
3862
  if (config.promptPath) {
3768
3863
  try {
3769
- return await readTextFile(config.promptPath);
3864
+ const content = await readTextFile(config.promptPath);
3865
+ return content;
3770
3866
  } catch (error) {
3771
3867
  const message = error instanceof Error ? error.message : String(error);
3772
3868
  console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);