@agentv/core 0.9.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -54,6 +54,7 @@ __export(index_exports, {
54
54
  loadEvalCases: () => loadEvalCases,
55
55
  normalizeLineEndings: () => normalizeLineEndings,
56
56
  readTargetDefinitions: () => readTargetDefinitions,
57
+ readTestSuiteMetadata: () => readTestSuiteMetadata,
57
58
  readTextFile: () => readTextFile,
58
59
  resolveAndCreateProvider: () => resolveAndCreateProvider,
59
60
  resolveFileReference: () => resolveFileReference,
@@ -239,6 +240,33 @@ var ANSI_YELLOW = "\x1B[33m";
239
240
  var ANSI_RESET = "\x1B[0m";
240
241
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
241
242
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
243
/**
 * Best-effort reader for suite-level metadata of a test file.
 *
 * Resolves `testFilePath` to an absolute path, reads it as UTF-8, parses the
 * text with the imported `parse` from `import_yaml`, and returns
 * `{ target }`, where `target` is whatever `extractTargetFromSuite` yields
 * for the parsed document.
 *
 * @param {string} testFilePath - Path to the test file.
 * @returns {Promise<{target?: string}>} `{}` when the parsed document is not
 *   a JSON object (per `isJsonObject`) or when anything in the pipeline
 *   throws; this function never rejects.
 */
async function readTestSuiteMetadata(testFilePath) {
  try {
    const suiteText = await (0, import_promises2.readFile)(
      import_node_path2.default.resolve(testFilePath),
      "utf8"
    );
    const suite = (0, import_yaml.parse)(suiteText);
    return isJsonObject(suite) ? { target: extractTargetFromSuite(suite) } : {};
  } catch {
    // Deliberate: unreadable or malformed files simply yield no metadata.
    return {};
  }
}
256
/**
 * Picks the target name declared by a parsed suite document.
 *
 * Precedence: `suite.execution.target` (only when `execution` is a plain,
 * non-array object) wins over the top-level `suite.target`. A candidate is
 * accepted only when it is a string that is non-empty after trimming; the
 * trimmed value is returned.
 *
 * @param {unknown} suite - Parsed suite document. `null`, `undefined` and
 *   non-object values are tolerated and produce `undefined`.
 * @returns {string | undefined} The trimmed target name, or `undefined` when
 *   no usable target is declared.
 */
function extractTargetFromSuite(suite) {
  // Single normalizer so both lookup sites apply the exact same acceptance rule.
  const asTargetName = (value) => {
    if (typeof value !== "string") {
      return void 0;
    }
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : void 0;
  };
  // Guard against null/undefined/primitive input instead of throwing on property access.
  if (suite === null || typeof suite !== "object") {
    return void 0;
  }
  const execution = suite.execution;
  const isExecutionObject = execution !== null && typeof execution === "object" && !Array.isArray(execution);
  const executionTarget = isExecutionObject ? asTargetName(execution.target) : void 0;
  return executionTarget ?? asTargetName(suite.target);
}
242
270
  async function loadConfig(evalFilePath, repoRoot) {
243
271
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
244
272
  for (const directory of directories) {
@@ -415,6 +443,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
415
443
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
416
444
  }
417
445
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
446
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
447
+ const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
418
448
  const results = [];
419
449
  for (const rawEvalcase of rawTestcases) {
420
450
  if (!isJsonObject(rawEvalcase)) {
@@ -434,14 +464,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
434
464
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
435
465
  continue;
436
466
  }
437
- if (!Array.isArray(expectedMessagesValue)) {
438
- logWarning(`Eval case '${id}' missing expected_messages array`);
439
- continue;
440
- }
467
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
441
468
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
442
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
443
- if (expectedMessages.length === 0) {
444
- logWarning(`No expected message found for eval case: ${id}`);
469
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
470
+ if (hasExpectedMessages && expectedMessages.length === 0) {
471
+ logWarning(`No valid expected message found for eval case: ${id}`);
445
472
  continue;
446
473
  }
447
474
  if (expectedMessages.length > 1) {
@@ -459,20 +486,20 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
459
486
  messageType: "input",
460
487
  verbose
461
488
  });
462
- const outputSegments = await processMessages({
489
+ const outputSegments = hasExpectedMessages ? await processMessages({
463
490
  messages: expectedMessages,
464
491
  searchRoots,
465
492
  repoRootPath,
466
493
  guidelinePatterns,
467
494
  messageType: "output",
468
495
  verbose
469
- });
496
+ }) : [];
470
497
  const codeSnippets = extractCodeBlocks(inputSegments);
471
498
  const expectedContent = expectedMessages[0]?.content;
472
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
499
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
473
500
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
474
501
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
475
- const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
502
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
476
503
  const userFilePaths = [];
477
504
  for (const segment of inputSegments) {
478
505
  if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -488,6 +515,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
488
515
  dataset: datasetName,
489
516
  conversation_id: conversationId,
490
517
  question,
518
+ input_messages: inputMessages,
491
519
  input_segments: inputSegments,
492
520
  output_segments: outputSegments,
493
521
  reference_answer: referenceAnswer,
@@ -515,6 +543,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
515
543
  }
516
544
  return results;
517
545
  }
546
/**
 * Decides whether a conversation has to be rendered with per-role markers.
 *
 * @param {Array<{role: string}>} messages - Input messages.
 * @param {Iterable<Array<object>>} processedSegmentsByMessage - Segment lists,
 *   one per message.
 * @returns {boolean} `true` when any message has role "assistant" or "tool",
 *   or when more than one segment list has visible content according to
 *   `hasVisibleContent`; otherwise `false`.
 */
function needsRoleMarkers(messages, processedSegmentsByMessage) {
  const hasAssistantOrToolTurn = messages.some(({ role }) => role === "assistant" || role === "tool");
  if (hasAssistantOrToolTurn) {
    return true;
  }
  const visibleMessageCount = Array.from(processedSegmentsByMessage).filter(
    (segments) => hasVisibleContent(segments)
  ).length;
  return visibleMessageCount > 1;
}
558
/**
 * Reports whether a segment list contains anything that would be rendered.
 *
 * A "text" segment counts when `asString(segment.value)` is a non-blank
 * string; a "file" segment counts when `asString(segment.text)` is a
 * non-blank string. "guideline_ref" segments and every other type never count.
 *
 * NOTE(review): `asString` is defined elsewhere in the file; this assumes it
 * returns the value when it is a string and `undefined` otherwise — confirm.
 *
 * @param {Array<object>} segments - Segments of a single message.
 * @returns {boolean} `true` if at least one segment has visible content.
 */
function hasVisibleContent(segments) {
  const isNonBlank = (value) => value !== void 0 && value.trim().length > 0;
  return segments.some((segment) => {
    switch (asString(segment.type)) {
      case "text":
        return isNonBlank(asString(segment.value));
      case "file":
        return isNonBlank(asString(segment.text));
      default:
        // Includes "guideline_ref": references alone are not visible content.
        return false;
    }
  });
}
575
/**
 * Renders one segment as prompt text.
 *
 * - "text": returns `asString(segment.value)` unchanged.
 * - "guideline_ref": returns `<Attached: PATH>` when a path is present.
 * - "file": returns a `=== PATH ===` header line followed by the file text,
 *   but only when both text and path are present.
 *
 * NOTE(review): `asString` is defined elsewhere in the file; this assumes it
 * returns the value when it is a string and `undefined` otherwise — confirm.
 *
 * @param {object} segment - Segment to render.
 * @returns {string | undefined} Rendered text, or `undefined` when the
 *   segment type is unknown or required fields are missing.
 */
function formatSegment(segment) {
  switch (asString(segment.type)) {
    case "text":
      return asString(segment.value);
    case "guideline_ref": {
      const attachedPath = asString(segment.path);
      if (!attachedPath) {
        return void 0;
      }
      return `<Attached: ${attachedPath}>`;
    }
    case "file": {
      const body = asString(segment.text);
      const label = asString(segment.path);
      return body && label ? `=== ${label} ===\n${body}` : void 0;
    }
    default:
      return void 0;
  }
}
518
594
  async function buildPromptInputs(testCase) {
519
595
  const guidelineContents = [];
520
596
  for (const rawPath of testCase.guideline_paths) {
@@ -531,36 +607,168 @@ ${content}`);
531
607
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
532
608
  }
533
609
  }
534
- const questionParts = [];
610
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
611
+ const segmentsByMessage = [];
612
+ const fileContentsByPath = /* @__PURE__ */ new Map();
535
613
  for (const segment of testCase.input_segments) {
536
- const typeValue = segment.type;
537
- if (typeof typeValue === "string" && typeValue === "file") {
538
- const pathValue = segment.path;
539
- const textValue = segment.text;
540
- const label = typeof pathValue === "string" ? pathValue : "file";
541
- const body = typeof textValue === "string" ? textValue : "";
542
- questionParts.push(`=== ${label} ===
543
- ${body}`);
544
- continue;
614
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
615
+ fileContentsByPath.set(segment.path, segment.text);
545
616
  }
546
- if (typeof typeValue === "string" && typeValue === "text") {
547
- const value = segment.value;
548
- if (typeof value === "string") {
549
- questionParts.push(value);
617
+ }
618
+ for (const message of testCase.input_messages) {
619
+ const messageSegments = [];
620
+ if (typeof message.content === "string") {
621
+ if (message.content.trim().length > 0) {
622
+ messageSegments.push({ type: "text", value: message.content });
623
+ }
624
+ } else if (Array.isArray(message.content)) {
625
+ for (const segment of message.content) {
626
+ if (typeof segment === "string") {
627
+ if (segment.trim().length > 0) {
628
+ messageSegments.push({ type: "text", value: segment });
629
+ }
630
+ } else if (isJsonObject(segment)) {
631
+ const type = asString(segment.type);
632
+ if (type === "file") {
633
+ const value = asString(segment.value);
634
+ if (!value) continue;
635
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
636
+ messageSegments.push({ type: "guideline_ref", path: value });
637
+ continue;
638
+ }
639
+ const fileText = fileContentsByPath.get(value);
640
+ if (fileText !== void 0) {
641
+ messageSegments.push({ type: "file", text: fileText, path: value });
642
+ }
643
+ } else if (type === "text") {
644
+ const textValue = asString(segment.value);
645
+ if (textValue && textValue.trim().length > 0) {
646
+ messageSegments.push({ type: "text", value: textValue });
647
+ }
648
+ }
649
+ }
650
+ }
651
+ }
652
+ segmentsByMessage.push(messageSegments);
653
+ }
654
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
655
+ let question;
656
+ if (useRoleMarkers) {
657
+ const messageParts = [];
658
+ for (let i = 0; i < testCase.input_messages.length; i++) {
659
+ const message = testCase.input_messages[i];
660
+ const segments = segmentsByMessage[i];
661
+ if (!hasVisibleContent(segments)) {
662
+ continue;
663
+ }
664
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
665
+ const contentParts = [];
666
+ for (const segment of segments) {
667
+ const formattedContent = formatSegment(segment);
668
+ if (formattedContent) {
669
+ contentParts.push(formattedContent);
670
+ }
671
+ }
672
+ if (contentParts.length > 0) {
673
+ const messageContent = contentParts.join("\n");
674
+ messageParts.push(`@[${roleLabel}]:
675
+ ${messageContent}`);
676
+ }
677
+ }
678
+ question = messageParts.join("\n\n");
679
+ } else {
680
+ const questionParts = [];
681
+ for (const segment of testCase.input_segments) {
682
+ const formattedContent = formatSegment(segment);
683
+ if (formattedContent) {
684
+ questionParts.push(formattedContent);
685
+ }
686
+ }
687
+ if (testCase.code_snippets.length > 0) {
688
+ questionParts.push(testCase.code_snippets.join("\n"));
689
+ }
690
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
691
+ }
692
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
693
+ messages: testCase.input_messages,
694
+ segmentsByMessage,
695
+ guidelinePatterns: testCase.guideline_patterns,
696
+ guidelineContent: guidelines
697
+ }) : void 0;
698
+ return { question, guidelines, chatPrompt };
699
+ }
700
+ function buildChatPromptFromSegments(options) {
701
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
702
+ if (messages.length === 0) {
703
+ return void 0;
704
+ }
705
+ const systemSegments = [];
706
+ if (systemPrompt && systemPrompt.trim().length > 0) {
707
+ systemSegments.push(systemPrompt.trim());
708
+ }
709
+ if (guidelineContent && guidelineContent.trim().length > 0) {
710
+ systemSegments.push(`[[ ## Guidelines ## ]]
711
+
712
+ ${guidelineContent.trim()}`);
713
+ }
714
+ let startIndex = 0;
715
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
716
+ const segments = segmentsByMessage[startIndex];
717
+ const contentParts = [];
718
+ for (const segment of segments) {
719
+ const formatted = formatSegment(segment);
720
+ if (formatted) {
721
+ contentParts.push(formatted);
550
722
  }
551
- continue;
552
723
  }
553
- const genericValue = segment.value;
554
- if (typeof genericValue === "string") {
555
- questionParts.push(genericValue);
724
+ if (contentParts.length > 0) {
725
+ systemSegments.push(contentParts.join("\n"));
556
726
  }
727
+ startIndex += 1;
557
728
  }
558
- if (testCase.code_snippets.length > 0) {
559
- questionParts.push(testCase.code_snippets.join("\n"));
729
+ const chatPrompt = [];
730
+ if (systemSegments.length > 0) {
731
+ chatPrompt.push({
732
+ role: "system",
733
+ content: systemSegments.join("\n\n")
734
+ });
560
735
  }
561
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
562
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
563
- return { question, guidelines };
736
+ for (let i = startIndex; i < messages.length; i++) {
737
+ const message = messages[i];
738
+ const segments = segmentsByMessage[i];
739
+ const contentParts = [];
740
+ let role = message.role;
741
+ let name;
742
+ if (role === "system") {
743
+ role = "assistant";
744
+ contentParts.push("@[System]:");
745
+ } else if (role === "tool") {
746
+ role = "function";
747
+ name = "tool";
748
+ }
749
+ for (const segment of segments) {
750
+ if (segment.type === "guideline_ref") {
751
+ continue;
752
+ }
753
+ const formatted = formatSegment(segment);
754
+ if (formatted) {
755
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
756
+ if (isGuidelineRef) {
757
+ continue;
758
+ }
759
+ contentParts.push(formatted);
760
+ }
761
+ }
762
+ if (contentParts.length === 0) {
763
+ continue;
764
+ }
765
+ chatPrompt.push({
766
+ role,
767
+ content: contentParts.join("\n"),
768
+ ...name ? { name } : {}
769
+ });
770
+ }
771
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
564
772
  }
565
773
  async function fileExists2(absolutePath) {
566
774
  try {
@@ -658,9 +866,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
658
866
  }
659
867
  return parts.join(" ");
660
868
  }
661
- async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
869
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
662
870
  const execution = rawEvalCase.execution;
663
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
871
+ const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
664
872
  if (candidateEvaluators === void 0) {
665
873
  return void 0;
666
874
  }
@@ -698,6 +906,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
698
906
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
699
907
  );
700
908
  }
909
+ } else {
910
+ resolvedCwd = searchRoots[0];
701
911
  }
702
912
  evaluators.push({
703
913
  name,
@@ -726,8 +936,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
726
936
  name,
727
937
  type: "llm_judge",
728
938
  prompt,
729
- promptPath,
730
- model
939
+ promptPath
731
940
  });
732
941
  }
733
942
  return evaluators.length > 0 ? evaluators : void 0;
@@ -757,21 +966,14 @@ var import_ax = require("@ax-llm/ax");
757
966
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
758
967
  function buildChatPrompt(request) {
759
968
  if (request.chatPrompt) {
760
- return request.chatPrompt;
761
- }
762
- const systemSegments = [];
763
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
764
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
765
- systemSegments.push(metadataSystemPrompt.trim());
766
- } else {
767
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
768
- }
769
- if (request.guidelines && request.guidelines.trim().length > 0) {
770
- systemSegments.push(`[[ ## Guidelines ## ]]
771
-
772
- ${request.guidelines.trim()}`);
969
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
970
+ if (hasSystemMessage) {
971
+ return request.chatPrompt;
972
+ }
973
+ const systemContent2 = resolveSystemContent(request);
974
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
773
975
  }
774
- const systemContent = systemSegments.join("\n\n");
976
+ const systemContent = resolveSystemContent(request);
775
977
  const userContent = request.question.trim();
776
978
  const prompt = [
777
979
  {
@@ -785,6 +987,21 @@ ${request.guidelines.trim()}`);
785
987
  ];
786
988
  return prompt;
787
989
  }
990
/**
 * Builds the system-message text for a provider request.
 *
 * The first section is `request.metadata.systemPrompt` (trimmed) when it is a
 * non-blank string, otherwise `DEFAULT_SYSTEM_PROMPT`. When
 * `request.guidelines` is non-blank, a "[[ ## Guidelines ## ]]" section with
 * the trimmed guidelines is appended. Sections are separated by a blank line.
 *
 * @param {{metadata?: {systemPrompt?: unknown}, guidelines?: string}} request
 * @returns {string} The combined system content.
 */
function resolveSystemContent(request) {
  const rawOverride = request.metadata?.systemPrompt;
  const override = typeof rawOverride === "string" ? rawOverride.trim() : "";
  const sections = [override.length > 0 ? override : DEFAULT_SYSTEM_PROMPT];
  const guidelineText = request.guidelines ? request.guidelines.trim() : "";
  if (guidelineText.length > 0) {
    sections.push(`[[ ## Guidelines ## ]]\n\n${guidelineText}`);
  }
  return sections.join("\n\n");
}
788
1005
  function extractModelConfig(request, defaults) {
789
1006
  const temperature = request.temperature ?? defaults.temperature;
790
1007
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -3020,24 +3237,23 @@ var LlmJudgeEvaluator = class {
3020
3237
  return this.evaluateWithPrompt(context, judgeProvider);
3021
3238
  }
3022
3239
  async evaluateWithPrompt(context, judgeProvider) {
3023
- let prompt = buildQualityPrompt(context.evalCase, context.candidate);
3024
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
3240
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
3241
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3242
+ let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
3243
+ let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
3025
3244
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
3026
3245
  const variables = {
3027
3246
  input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3028
3247
  output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3029
3248
  candidate_answer: context.candidate,
3030
- reference_answer: context.evalCase.reference_answer,
3249
+ reference_answer: context.evalCase.reference_answer ?? "",
3031
3250
  expected_outcome: context.evalCase.expected_outcome,
3032
- question: context.evalCase.question
3251
+ question: formattedQuestion
3033
3252
  };
3034
3253
  prompt = substituteVariables(systemPrompt, variables);
3035
- systemPrompt = QUALITY_SYSTEM_PROMPT;
3254
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
3036
3255
  }
3037
- const metadata = {
3038
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
3039
- ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
3040
- };
3256
+ const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
3041
3257
  const response = await judgeProvider.invoke({
3042
3258
  question: prompt,
3043
3259
  metadata,
@@ -3057,8 +3273,7 @@ var LlmJudgeEvaluator = class {
3057
3273
  provider: judgeProvider.id,
3058
3274
  prompt,
3059
3275
  target: context.target.name,
3060
- ...systemPrompt !== void 0 ? { systemPrompt } : {},
3061
- ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
3276
+ ...systemPrompt !== void 0 && { systemPrompt }
3062
3277
  };
3063
3278
  return {
3064
3279
  score,
@@ -3070,38 +3285,51 @@ var LlmJudgeEvaluator = class {
3070
3285
  };
3071
3286
  }
3072
3287
  };
3073
- var QUALITY_SYSTEM_PROMPT = [
3074
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
3075
- "",
3076
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
3077
- "",
3078
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
3079
- "",
3080
- "You must respond with a single JSON object matching this schema:",
3081
- "",
3082
- "{",
3083
- ' "score": <number between 0.0 and 1.0>,',
3084
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
3085
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
3086
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
3087
- "}"
3088
- ].join("\n");
3089
- function buildQualityPrompt(evalCase, candidate) {
3288
/**
 * Produces the judge's system prompt as newline-joined text.
 *
 * The paragraph instructing the judge to treat `reference_answer` as a gold
 * standard is included only when `hasReferenceAnswer` is truthy; the role
 * statement, the conciseness instruction and the JSON response schema are
 * always present.
 *
 * @param {boolean} hasReferenceAnswer - Whether a reference answer exists.
 * @returns {string} The system prompt.
 */
function buildSystemPrompt(hasReferenceAnswer) {
  const referenceGuidance = hasReferenceAnswer ? [
    "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
    ""
  ] : [];
  return [
    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
    "",
    ...referenceGuidance,
    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
    "",
    "You must respond with a single JSON object matching this schema:",
    "",
    "{",
    ' "score": <number between 0.0 and 1.0>,',
    ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
    ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
    ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
    "}"
  ].join("\n");
}
3313
/**
 * Assembles the user-facing prompt sent to the LLM judge.
 *
 * Sections appear in this order, each introduced by a `[[ ## name ## ]]`
 * header: expected_outcome, question, reference_answer (only when
 * `hasNonEmptyReferenceAnswer(evalCase)` is true), candidate_answer. All
 * values are trimmed and the lines are joined with "\n".
 *
 * @param {{expected_outcome: string, reference_answer?: string}} evalCase
 * @param {string} candidate - The answer being graded.
 * @param {string} question - The formatted question text.
 * @returns {string} The judge prompt.
 */
function buildQualityPrompt(evalCase, candidate, question) {
  const sections = [];
  sections.push("[[ ## expected_outcome ## ]]", evalCase.expected_outcome.trim(), "");
  sections.push("[[ ## question ## ]]", question.trim(), "");
  if (hasNonEmptyReferenceAnswer(evalCase)) {
    sections.push("[[ ## reference_answer ## ]]", evalCase.reference_answer.trim(), "");
  }
  sections.push("[[ ## candidate_answer ## ]]", candidate.trim());
  return sections.join("\n");
}
3107
3335
  function clampScore(value) {
@@ -3184,6 +3412,9 @@ function extractJsonBlob(text) {
3184
3412
  function isNonEmptyString(value) {
3185
3413
  return typeof value === "string" && value.trim().length > 0;
3186
3414
  }
3415
/**
 * Tells whether an eval case carries a usable reference answer.
 *
 * The check is type-based rather than `!== undefined`, so a `null` or
 * non-string `reference_answer` (for example a number parsed from the test
 * file) is reported as absent instead of throwing on `.trim()`.
 *
 * @param {{reference_answer?: unknown}} evalCase - Eval case to inspect.
 * @returns {boolean} `true` only when `reference_answer` is a string with
 *   non-whitespace content.
 */
function hasNonEmptyReferenceAnswer(evalCase) {
  const referenceAnswer = evalCase.reference_answer;
  return typeof referenceAnswer === "string" && referenceAnswer.trim().length > 0;
}
3187
3418
  var CodeEvaluator = class {
3188
3419
  kind = "code";
3189
3420
  script;
@@ -3842,11 +4073,27 @@ async function evaluateCandidate(options) {
3842
4073
  agentTimeoutMs
3843
4074
  });
3844
4075
  const completedAt = nowFn();
3845
- const rawRequest = {
3846
- question: promptInputs.question,
3847
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
3848
- guideline_paths: evalCase.guideline_paths
3849
- };
4076
+ let agentProviderRequest;
4077
+ let lmProviderRequest;
4078
+ if (isAgentProvider(provider)) {
4079
+ agentProviderRequest = {
4080
+ question: promptInputs.question,
4081
+ guideline_paths: evalCase.guideline_paths
4082
+ };
4083
+ } else {
4084
+ if (promptInputs.chatPrompt) {
4085
+ lmProviderRequest = {
4086
+ chat_prompt: promptInputs.chatPrompt,
4087
+ guideline_paths: evalCase.guideline_paths
4088
+ };
4089
+ } else {
4090
+ lmProviderRequest = {
4091
+ question: promptInputs.question,
4092
+ guidelines: promptInputs.guidelines,
4093
+ guideline_paths: evalCase.guideline_paths
4094
+ };
4095
+ }
4096
+ }
3850
4097
  return {
3851
4098
  eval_id: evalCase.id,
3852
4099
  dataset: evalCase.dataset,
@@ -3860,7 +4107,8 @@ async function evaluateCandidate(options) {
3860
4107
  timestamp: completedAt.toISOString(),
3861
4108
  reasoning: score.reasoning,
3862
4109
  raw_aspects: score.rawAspects,
3863
- raw_request: rawRequest,
4110
+ agent_provider_request: agentProviderRequest,
4111
+ lm_provider_request: lmProviderRequest,
3864
4112
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3865
4113
  evaluator_results: evaluatorResults
3866
4114
  };
@@ -4019,8 +4267,7 @@ async function runLlmJudgeEvaluator(options) {
4019
4267
  now,
4020
4268
  judgeProvider,
4021
4269
  systemPrompt: customPrompt,
4022
- evaluator: config,
4023
- judgeModel: config.model
4270
+ evaluator: config
4024
4271
  });
4025
4272
  }
4026
4273
  async function resolveCustomPrompt(config) {
@@ -4089,6 +4336,7 @@ async function invokeProvider(provider, options) {
4089
4336
  question: promptInputs.question,
4090
4337
  guidelines: promptInputs.guidelines,
4091
4338
  guideline_patterns: evalCase.guideline_patterns,
4339
+ chatPrompt: promptInputs.chatPrompt,
4092
4340
  inputFiles: evalCase.file_paths,
4093
4341
  evalCaseId: evalCase.id,
4094
4342
  attempt,
@@ -4105,12 +4353,30 @@ async function invokeProvider(provider, options) {
4105
4353
  }
4106
4354
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
4107
4355
  const message = error instanceof Error ? error.message : String(error);
4108
- const rawRequest = {
4109
- question: promptInputs.question,
4110
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
4111
- guideline_paths: evalCase.guideline_paths,
4112
- error: message
4113
- };
4356
+ let agentProviderRequest;
4357
+ let lmProviderRequest;
4358
+ if (isAgentProvider(provider)) {
4359
+ agentProviderRequest = {
4360
+ question: promptInputs.question,
4361
+ guideline_paths: evalCase.guideline_paths,
4362
+ error: message
4363
+ };
4364
+ } else {
4365
+ if (promptInputs.chatPrompt) {
4366
+ lmProviderRequest = {
4367
+ chat_prompt: promptInputs.chatPrompt,
4368
+ guideline_paths: evalCase.guideline_paths,
4369
+ error: message
4370
+ };
4371
+ } else {
4372
+ lmProviderRequest = {
4373
+ question: promptInputs.question,
4374
+ guidelines: promptInputs.guidelines,
4375
+ guideline_paths: evalCase.guideline_paths,
4376
+ error: message
4377
+ };
4378
+ }
4379
+ }
4114
4380
  return {
4115
4381
  eval_id: evalCase.id,
4116
4382
  dataset: evalCase.dataset,
@@ -4123,7 +4389,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4123
4389
  target: targetName,
4124
4390
  timestamp: timestamp.toISOString(),
4125
4391
  raw_aspects: [],
4126
- raw_request: rawRequest,
4392
+ agent_provider_request: agentProviderRequest,
4393
+ lm_provider_request: lmProviderRequest,
4127
4394
  error: message
4128
4395
  };
4129
4396
  }
@@ -4135,6 +4402,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
4135
4402
  hash.update(promptInputs.question);
4136
4403
  hash.update(promptInputs.guidelines);
4137
4404
  hash.update(promptInputs.systemMessage ?? "");
4405
+ if (promptInputs.chatPrompt) {
4406
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
4407
+ }
4138
4408
  return hash.digest("hex");
4139
4409
  }
4140
4410
  function isTimeoutLike(error) {
@@ -4183,6 +4453,7 @@ function createAgentKernel() {
4183
4453
  loadEvalCases,
4184
4454
  normalizeLineEndings,
4185
4455
  readTargetDefinitions,
4456
+ readTestSuiteMetadata,
4186
4457
  readTextFile,
4187
4458
  resolveAndCreateProvider,
4188
4459
  resolveFileReference,