@agentv/core 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SNTZFB24.js → chunk-YQBJAT5I.js} +1 -1
- package/dist/{chunk-SNTZFB24.js.map → chunk-YQBJAT5I.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +30 -13
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +21 -4
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +335 -91
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -62
- package/dist/index.d.ts +67 -62
- package/dist/index.js +336 -92
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -434,14 +434,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
434
434
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
435
435
|
continue;
|
|
436
436
|
}
|
|
437
|
-
|
|
438
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
439
|
-
continue;
|
|
440
|
-
}
|
|
437
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
441
438
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
442
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
443
|
-
if (expectedMessages.length === 0) {
|
|
444
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
439
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
440
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
441
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
445
442
|
continue;
|
|
446
443
|
}
|
|
447
444
|
if (expectedMessages.length > 1) {
|
|
@@ -459,17 +456,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
459
456
|
messageType: "input",
|
|
460
457
|
verbose
|
|
461
458
|
});
|
|
462
|
-
const outputSegments = await processMessages({
|
|
459
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
463
460
|
messages: expectedMessages,
|
|
464
461
|
searchRoots,
|
|
465
462
|
repoRootPath,
|
|
466
463
|
guidelinePatterns,
|
|
467
464
|
messageType: "output",
|
|
468
465
|
verbose
|
|
469
|
-
});
|
|
466
|
+
}) : [];
|
|
470
467
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
471
468
|
const expectedContent = expectedMessages[0]?.content;
|
|
472
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
469
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
473
470
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
474
471
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
475
472
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
@@ -488,6 +485,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
488
485
|
dataset: datasetName,
|
|
489
486
|
conversation_id: conversationId,
|
|
490
487
|
question,
|
|
488
|
+
input_messages: inputMessages,
|
|
491
489
|
input_segments: inputSegments,
|
|
492
490
|
output_segments: outputSegments,
|
|
493
491
|
reference_answer: referenceAnswer,
|
|
@@ -515,6 +513,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
515
513
|
}
|
|
516
514
|
return results;
|
|
517
515
|
}
|
|
516
|
+
/**
 * Decides whether the prompt should be rendered with per-message role markers.
 *
 * Markers are required as soon as any message carries the "assistant" or
 * "tool" role. Otherwise they are required only when more than one message
 * has visible content, as judged by hasVisibleContent.
 *
 * @param {Array<{role: string}>} messages - Messages whose roles are inspected.
 * @param {Iterable<Array<object>>} processedSegmentsByMessage - One list of
 *   processed segments per message.
 * @returns {boolean} True when role markers should be used.
 */
function needsRoleMarkers(messages, processedSegmentsByMessage) {
  const markerRoles = ["assistant", "tool"];
  const hasMarkerRole = messages.some((entry) => markerRoles.includes(entry.role));
  if (hasMarkerRole) {
    return true;
  }
  // Count the messages that would actually contribute text to the prompt.
  const visibleCount = Array.from(processedSegmentsByMessage)
    .filter((messageSegments) => hasVisibleContent(messageSegments))
    .length;
  return visibleCount > 1;
}
|
|
528
|
+
/**
 * Reports whether a list of processed segments contains anything that would
 * show up as text in the rendered prompt.
 *
 * A "text" segment counts when its `value` is non-blank; a "file" segment
 * counts when its `text` is non-blank. "guideline_ref" segments and any
 * other segment type never count.
 *
 * NOTE(review): relies on the asString helper defined elsewhere in this
 * file; presumably it yields undefined for non-string input — confirm.
 *
 * @param {Array<object>} segments - Processed segments of a single message.
 * @returns {boolean} True when at least one segment has visible content.
 */
function hasVisibleContent(segments) {
  const isNonBlank = (candidate) => candidate !== void 0 && candidate.trim().length > 0;
  return segments.some((segment) => {
    switch (asString(segment.type)) {
      case "text":
        return isNonBlank(asString(segment.value));
      case "file":
        return isNonBlank(asString(segment.text));
      default:
        // Covers "guideline_ref" as well as unknown segment types.
        return false;
    }
  });
}
|
|
545
|
+
/**
 * Renders a single processed segment as prompt text.
 *
 * - "text": returns the segment's `value` (via asString).
 * - "guideline_ref": returns `<Attached: path>` when a path is present.
 * - "file": returns a `=== path ===` header followed by the file text, but
 *   only when both the text and the path are present.
 *
 * @param {object} segment - Processed segment with a `type` field.
 * @returns {string|undefined} The formatted text, or undefined when the
 *   segment has nothing to render or its type is not recognised.
 */
function formatSegment(segment) {
  switch (asString(segment.type)) {
    case "text":
      return asString(segment.value);
    case "guideline_ref": {
      const attachedPath = asString(segment.path);
      if (!attachedPath) {
        return void 0;
      }
      return `<Attached: ${attachedPath}>`;
    }
    case "file": {
      const body = asString(segment.text);
      const location = asString(segment.path);
      if (!body || !location) {
        return void 0;
      }
      return `=== ${location} ===
${body}`;
    }
    default:
      return void 0;
  }
}
|
|
518
564
|
async function buildPromptInputs(testCase) {
|
|
519
565
|
const guidelineContents = [];
|
|
520
566
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -531,36 +577,168 @@ ${content}`);
|
|
|
531
577
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
532
578
|
}
|
|
533
579
|
}
|
|
534
|
-
const
|
|
580
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
581
|
+
const segmentsByMessage = [];
|
|
582
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
535
583
|
for (const segment of testCase.input_segments) {
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
const pathValue = segment.path;
|
|
539
|
-
const textValue = segment.text;
|
|
540
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
541
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
542
|
-
questionParts.push(`=== ${label} ===
|
|
543
|
-
${body}`);
|
|
544
|
-
continue;
|
|
584
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
585
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
545
586
|
}
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
587
|
+
}
|
|
588
|
+
for (const message of testCase.input_messages) {
|
|
589
|
+
const messageSegments = [];
|
|
590
|
+
if (typeof message.content === "string") {
|
|
591
|
+
if (message.content.trim().length > 0) {
|
|
592
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
593
|
+
}
|
|
594
|
+
} else if (Array.isArray(message.content)) {
|
|
595
|
+
for (const segment of message.content) {
|
|
596
|
+
if (typeof segment === "string") {
|
|
597
|
+
if (segment.trim().length > 0) {
|
|
598
|
+
messageSegments.push({ type: "text", value: segment });
|
|
599
|
+
}
|
|
600
|
+
} else if (isJsonObject(segment)) {
|
|
601
|
+
const type = asString(segment.type);
|
|
602
|
+
if (type === "file") {
|
|
603
|
+
const value = asString(segment.value);
|
|
604
|
+
if (!value) continue;
|
|
605
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
606
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
607
|
+
continue;
|
|
608
|
+
}
|
|
609
|
+
const fileText = fileContentsByPath.get(value);
|
|
610
|
+
if (fileText !== void 0) {
|
|
611
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
612
|
+
}
|
|
613
|
+
} else if (type === "text") {
|
|
614
|
+
const textValue = asString(segment.value);
|
|
615
|
+
if (textValue && textValue.trim().length > 0) {
|
|
616
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
}
|
|
550
620
|
}
|
|
551
|
-
continue;
|
|
552
621
|
}
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
622
|
+
segmentsByMessage.push(messageSegments);
|
|
623
|
+
}
|
|
624
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
625
|
+
let question;
|
|
626
|
+
if (useRoleMarkers) {
|
|
627
|
+
const messageParts = [];
|
|
628
|
+
for (let i = 0; i < testCase.input_messages.length; i++) {
|
|
629
|
+
const message = testCase.input_messages[i];
|
|
630
|
+
const segments = segmentsByMessage[i];
|
|
631
|
+
if (!hasVisibleContent(segments)) {
|
|
632
|
+
continue;
|
|
633
|
+
}
|
|
634
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
635
|
+
const contentParts = [];
|
|
636
|
+
for (const segment of segments) {
|
|
637
|
+
const formattedContent = formatSegment(segment);
|
|
638
|
+
if (formattedContent) {
|
|
639
|
+
contentParts.push(formattedContent);
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
if (contentParts.length > 0) {
|
|
643
|
+
const messageContent = contentParts.join("\n");
|
|
644
|
+
messageParts.push(`@[${roleLabel}]:
|
|
645
|
+
${messageContent}`);
|
|
646
|
+
}
|
|
556
647
|
}
|
|
648
|
+
question = messageParts.join("\n\n");
|
|
649
|
+
} else {
|
|
650
|
+
const questionParts = [];
|
|
651
|
+
for (const segment of testCase.input_segments) {
|
|
652
|
+
const formattedContent = formatSegment(segment);
|
|
653
|
+
if (formattedContent) {
|
|
654
|
+
questionParts.push(formattedContent);
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
if (testCase.code_snippets.length > 0) {
|
|
658
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
659
|
+
}
|
|
660
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
557
661
|
}
|
|
558
|
-
|
|
559
|
-
|
|
662
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
663
|
+
messages: testCase.input_messages,
|
|
664
|
+
segmentsByMessage,
|
|
665
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
666
|
+
guidelineContent: guidelines
|
|
667
|
+
}) : void 0;
|
|
668
|
+
return { question, guidelines, chatPrompt };
|
|
669
|
+
}
|
|
670
|
+
/**
 * Builds a chat-style prompt (array of `{ role, content, name? }`) from the
 * input messages and their already-processed segments.
 *
 * Layout of the result:
 *  1. One leading "system" message assembled from, in order: the optional
 *     `systemPrompt`, the guidelines block (when `guidelineContent` is
 *     non-blank), and the content of every consecutive "system" message at
 *     the start of `messages`.
 *  2. One entry per remaining message that has renderable content. A
 *     "system" message appearing later in the conversation is emitted with
 *     role "assistant" and an "@[System]:" marker line; a "tool" message is
 *     emitted with role "function" and name "tool".
 *
 * Guideline references are left out of the per-message content: both
 * "guideline_ref" segments and "file" segments whose path matches
 * `guidelinePatterns`.
 *
 * Fix over the previous implementation: the "@[System]:" marker is only
 * added when the system message actually has content. Previously the marker
 * was pushed first, which defeated the "skip empty message" check and
 * produced a message consisting of nothing but the marker. A missing entry
 * in `segmentsByMessage` is now treated as an empty segment list instead of
 * throwing.
 *
 * @param {object} options
 * @param {Array<{role: string}>} options.messages - Input messages in order.
 * @param {Array<Array<object>>} options.segmentsByMessage - Processed
 *   segments, index-aligned with `messages`.
 * @param {*} [options.guidelinePatterns] - Patterns handed to isGuidelineFile.
 * @param {string} [options.guidelineContent] - Guideline text for the system message.
 * @param {string} [options.systemPrompt] - Optional leading system prompt.
 * @returns {Array<object>|undefined} The chat prompt, or undefined when
 *   there are no messages or nothing renderable.
 */
function buildChatPromptFromSegments(options) {
  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
  if (messages.length === 0) {
    return void 0;
  }
  const systemSegments = [];
  if (systemPrompt && systemPrompt.trim().length > 0) {
    systemSegments.push(systemPrompt.trim());
  }
  if (guidelineContent && guidelineContent.trim().length > 0) {
    systemSegments.push(`[[ ## Guidelines ## ]]

${guidelineContent.trim()}`);
  }
  // Fold every leading system message into the single system entry.
  let startIndex = 0;
  while (startIndex < messages.length && messages[startIndex].role === "system") {
    const contentParts = [];
    for (const segment of segmentsByMessage[startIndex] ?? []) {
      const formatted = formatSegment(segment);
      if (formatted) {
        contentParts.push(formatted);
      }
    }
    if (contentParts.length > 0) {
      systemSegments.push(contentParts.join("\n"));
    }
    startIndex += 1;
  }
  const chatPrompt = [];
  if (systemSegments.length > 0) {
    chatPrompt.push({
      role: "system",
      content: systemSegments.join("\n\n")
    });
  }
  for (let i = startIndex; i < messages.length; i++) {
    const message = messages[i];
    const isInlineSystem = message.role === "system";
    let role = message.role;
    let name;
    if (isInlineSystem) {
      // A system message after the conversation has started is rendered as
      // an assistant turn with an explicit marker.
      role = "assistant";
    } else if (role === "tool") {
      role = "function";
      name = "tool";
    }
    const contentParts = [];
    for (const segment of segmentsByMessage[i] ?? []) {
      if (segment.type === "guideline_ref") {
        continue;
      }
      const formatted = formatSegment(segment);
      if (!formatted) {
        continue;
      }
      const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
      if (isGuidelineRef) {
        continue;
      }
      contentParts.push(formatted);
    }
    if (contentParts.length === 0) {
      // Nothing to say for this message: skip it, marker included.
      continue;
    }
    if (isInlineSystem) {
      contentParts.unshift("@[System]:");
    }
    chatPrompt.push({
      role,
      content: contentParts.join("\n"),
      ...name ? { name } : {}
    });
  }
  return chatPrompt.length > 0 ? chatPrompt : void 0;
}
|
|
565
743
|
async function fileExists2(absolutePath) {
|
|
566
744
|
try {
|
|
@@ -757,21 +935,14 @@ var import_ax = require("@ax-llm/ax");
|
|
|
757
935
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
758
936
|
function buildChatPrompt(request) {
|
|
759
937
|
if (request.chatPrompt) {
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
} else {
|
|
767
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
768
|
-
}
|
|
769
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
770
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
771
|
-
|
|
772
|
-
${request.guidelines.trim()}`);
|
|
938
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
939
|
+
if (hasSystemMessage) {
|
|
940
|
+
return request.chatPrompt;
|
|
941
|
+
}
|
|
942
|
+
const systemContent2 = resolveSystemContent(request);
|
|
943
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
773
944
|
}
|
|
774
|
-
const systemContent =
|
|
945
|
+
const systemContent = resolveSystemContent(request);
|
|
775
946
|
const userContent = request.question.trim();
|
|
776
947
|
const prompt = [
|
|
777
948
|
{
|
|
@@ -785,6 +956,21 @@ ${request.guidelines.trim()}`);
|
|
|
785
956
|
];
|
|
786
957
|
return prompt;
|
|
787
958
|
}
|
|
959
|
+
/**
 * Assembles the system message text for a provider request.
 *
 * The first section is `request.metadata.systemPrompt` (trimmed) when it is
 * a non-blank string, otherwise DEFAULT_SYSTEM_PROMPT. When
 * `request.guidelines` is non-blank, a "[[ ## Guidelines ## ]]" section with
 * the trimmed guidelines follows. Sections are separated by a blank line.
 *
 * @param {object} request - Provider request; reads `metadata.systemPrompt`
 *   and `guidelines`.
 * @returns {string} The combined system content.
 */
function resolveSystemContent(request) {
  const rawOverride = request.metadata?.systemPrompt;
  const override = typeof rawOverride === "string" ? rawOverride.trim() : "";
  const sections = [override.length > 0 ? override : DEFAULT_SYSTEM_PROMPT];
  const guidelineText = request.guidelines ? request.guidelines.trim() : "";
  if (guidelineText.length > 0) {
    sections.push(`[[ ## Guidelines ## ]]

${guidelineText}`);
  }
  return sections.join("\n\n");
}
|
|
788
974
|
function extractModelConfig(request, defaults) {
|
|
789
975
|
const temperature = request.temperature ?? defaults.temperature;
|
|
790
976
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -3020,19 +3206,21 @@ var LlmJudgeEvaluator = class {
|
|
|
3020
3206
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
3021
3207
|
}
|
|
3022
3208
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
3023
|
-
|
|
3024
|
-
|
|
3209
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
3210
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3211
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
|
|
3212
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
3025
3213
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
3026
3214
|
const variables = {
|
|
3027
3215
|
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3028
3216
|
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
3029
3217
|
candidate_answer: context.candidate,
|
|
3030
|
-
reference_answer: context.evalCase.reference_answer,
|
|
3218
|
+
reference_answer: context.evalCase.reference_answer ?? "",
|
|
3031
3219
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3032
|
-
question:
|
|
3220
|
+
question: formattedQuestion
|
|
3033
3221
|
};
|
|
3034
3222
|
prompt = substituteVariables(systemPrompt, variables);
|
|
3035
|
-
systemPrompt =
|
|
3223
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
3036
3224
|
}
|
|
3037
3225
|
const metadata = {
|
|
3038
3226
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
@@ -3070,38 +3258,51 @@ var LlmJudgeEvaluator = class {
|
|
|
3070
3258
|
};
|
|
3071
3259
|
}
|
|
3072
3260
|
};
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
3089
|
-
|
|
3261
|
+
/**
 * Builds the default system prompt for the LLM judge.
 *
 * The prompt always contains the evaluator role description, a request for
 * concise feedback and the JSON response schema. The paragraph telling the
 * judge to treat reference_answer as a gold standard is included only when
 * `hasReferenceAnswer` is truthy.
 *
 * @param {boolean} hasReferenceAnswer - Whether the eval case has a reference answer.
 * @returns {string} The newline-joined system prompt.
 */
function buildSystemPrompt(hasReferenceAnswer) {
  const referenceGuidance = hasReferenceAnswer ? [
    "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
    ""
  ] : [];
  const lines = [
    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
    "",
    ...referenceGuidance,
    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
    "",
    "You must respond with a single JSON object matching this schema:",
    "",
    "{",
    ' "score": <number between 0.0 and 1.0>,',
    ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
    ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
    ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
    "}"
  ];
  return lines.join("\n");
}
|
|
3286
|
+
/**
 * Builds the user prompt sent to the LLM judge.
 *
 * The prompt is a sequence of "[[ ## name ## ]]" sections: expected_outcome,
 * question, optionally reference_answer (only when the eval case has a
 * non-empty one, per hasNonEmptyReferenceAnswer), and candidate_answer.
 * Every value is trimmed, and sections are separated by an empty line.
 *
 * @param {object} evalCase - Eval case; reads `expected_outcome` and `reference_answer`.
 * @param {string} candidate - The candidate answer being graded.
 * @param {string} question - The formatted question shown to the judge.
 * @returns {string} The newline-joined prompt.
 */
function buildQualityPrompt(evalCase, candidate, question) {
  const lines = [
    "[[ ## expected_outcome ## ]]",
    evalCase.expected_outcome.trim(),
    "",
    "[[ ## question ## ]]",
    question.trim(),
    "",
    // The reference section is omitted entirely when there is no reference answer.
    ...(hasNonEmptyReferenceAnswer(evalCase) ? [
      "[[ ## reference_answer ## ]]",
      evalCase.reference_answer.trim(),
      ""
    ] : []),
    "[[ ## candidate_answer ## ]]",
    candidate.trim()
  ];
  return lines.join("\n");
}
|
|
3107
3308
|
function clampScore(value) {
|
|
@@ -3184,6 +3385,9 @@ function extractJsonBlob(text) {
|
|
|
3184
3385
|
/**
 * Tells whether a value is a string containing at least one
 * non-whitespace character.
 *
 * @param {*} value - Any value.
 * @returns {boolean} True for non-blank strings, false for everything else.
 */
function isNonEmptyString(value) {
  if (typeof value !== "string") {
    return false;
  }
  return value.trim() !== "";
}
|
|
3388
|
+
/**
 * Tells whether an eval case carries a usable reference answer.
 *
 * Only a string with at least one non-whitespace character counts. The
 * previous check guarded against `undefined` only, so a `null` or
 * non-string `reference_answer` reached `.trim()` and threw a TypeError;
 * such values are now reported as "no reference answer".
 *
 * @param {object} evalCase - Eval case; reads `reference_answer`.
 * @returns {boolean} True when `reference_answer` is a non-blank string.
 */
function hasNonEmptyReferenceAnswer(evalCase) {
  const referenceAnswer = evalCase.reference_answer;
  return typeof referenceAnswer === "string" && referenceAnswer.trim().length > 0;
}
|
|
3187
3391
|
var CodeEvaluator = class {
|
|
3188
3392
|
kind = "code";
|
|
3189
3393
|
script;
|
|
@@ -3842,11 +4046,27 @@ async function evaluateCandidate(options) {
|
|
|
3842
4046
|
agentTimeoutMs
|
|
3843
4047
|
});
|
|
3844
4048
|
const completedAt = nowFn();
|
|
3845
|
-
|
|
3846
|
-
|
|
3847
|
-
|
|
3848
|
-
|
|
3849
|
-
|
|
4049
|
+
let agentProviderRequest;
|
|
4050
|
+
let lmProviderRequest;
|
|
4051
|
+
if (isAgentProvider(provider)) {
|
|
4052
|
+
agentProviderRequest = {
|
|
4053
|
+
question: promptInputs.question,
|
|
4054
|
+
guideline_paths: evalCase.guideline_paths
|
|
4055
|
+
};
|
|
4056
|
+
} else {
|
|
4057
|
+
if (promptInputs.chatPrompt) {
|
|
4058
|
+
lmProviderRequest = {
|
|
4059
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
4060
|
+
guideline_paths: evalCase.guideline_paths
|
|
4061
|
+
};
|
|
4062
|
+
} else {
|
|
4063
|
+
lmProviderRequest = {
|
|
4064
|
+
question: promptInputs.question,
|
|
4065
|
+
guidelines: promptInputs.guidelines,
|
|
4066
|
+
guideline_paths: evalCase.guideline_paths
|
|
4067
|
+
};
|
|
4068
|
+
}
|
|
4069
|
+
}
|
|
3850
4070
|
return {
|
|
3851
4071
|
eval_id: evalCase.id,
|
|
3852
4072
|
dataset: evalCase.dataset,
|
|
@@ -3860,7 +4080,8 @@ async function evaluateCandidate(options) {
|
|
|
3860
4080
|
timestamp: completedAt.toISOString(),
|
|
3861
4081
|
reasoning: score.reasoning,
|
|
3862
4082
|
raw_aspects: score.rawAspects,
|
|
3863
|
-
|
|
4083
|
+
agent_provider_request: agentProviderRequest,
|
|
4084
|
+
lm_provider_request: lmProviderRequest,
|
|
3864
4085
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3865
4086
|
evaluator_results: evaluatorResults
|
|
3866
4087
|
};
|
|
@@ -4089,6 +4310,7 @@ async function invokeProvider(provider, options) {
|
|
|
4089
4310
|
question: promptInputs.question,
|
|
4090
4311
|
guidelines: promptInputs.guidelines,
|
|
4091
4312
|
guideline_patterns: evalCase.guideline_patterns,
|
|
4313
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
4092
4314
|
inputFiles: evalCase.file_paths,
|
|
4093
4315
|
evalCaseId: evalCase.id,
|
|
4094
4316
|
attempt,
|
|
@@ -4105,12 +4327,30 @@ async function invokeProvider(provider, options) {
|
|
|
4105
4327
|
}
|
|
4106
4328
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
4107
4329
|
const message = error instanceof Error ? error.message : String(error);
|
|
4108
|
-
|
|
4109
|
-
|
|
4110
|
-
|
|
4111
|
-
|
|
4112
|
-
|
|
4113
|
-
|
|
4330
|
+
let agentProviderRequest;
|
|
4331
|
+
let lmProviderRequest;
|
|
4332
|
+
if (isAgentProvider(provider)) {
|
|
4333
|
+
agentProviderRequest = {
|
|
4334
|
+
question: promptInputs.question,
|
|
4335
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4336
|
+
error: message
|
|
4337
|
+
};
|
|
4338
|
+
} else {
|
|
4339
|
+
if (promptInputs.chatPrompt) {
|
|
4340
|
+
lmProviderRequest = {
|
|
4341
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
4342
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4343
|
+
error: message
|
|
4344
|
+
};
|
|
4345
|
+
} else {
|
|
4346
|
+
lmProviderRequest = {
|
|
4347
|
+
question: promptInputs.question,
|
|
4348
|
+
guidelines: promptInputs.guidelines,
|
|
4349
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4350
|
+
error: message
|
|
4351
|
+
};
|
|
4352
|
+
}
|
|
4353
|
+
}
|
|
4114
4354
|
return {
|
|
4115
4355
|
eval_id: evalCase.id,
|
|
4116
4356
|
dataset: evalCase.dataset,
|
|
@@ -4123,7 +4363,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4123
4363
|
target: targetName,
|
|
4124
4364
|
timestamp: timestamp.toISOString(),
|
|
4125
4365
|
raw_aspects: [],
|
|
4126
|
-
|
|
4366
|
+
agent_provider_request: agentProviderRequest,
|
|
4367
|
+
lm_provider_request: lmProviderRequest,
|
|
4127
4368
|
error: message
|
|
4128
4369
|
};
|
|
4129
4370
|
}
|
|
@@ -4135,6 +4376,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
4135
4376
|
hash.update(promptInputs.question);
|
|
4136
4377
|
hash.update(promptInputs.guidelines);
|
|
4137
4378
|
hash.update(promptInputs.systemMessage ?? "");
|
|
4379
|
+
if (promptInputs.chatPrompt) {
|
|
4380
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
4381
|
+
}
|
|
4138
4382
|
return hash.digest("hex");
|
|
4139
4383
|
}
|
|
4140
4384
|
function isTimeoutLike(error) {
|