@agentv/core 0.7.5 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7XM7HYRS.js → chunk-YQBJAT5I.js} +97 -67
- package/dist/chunk-YQBJAT5I.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +61 -69
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +51 -58
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +538 -192
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +136 -58
- package/dist/index.d.ts +136 -58
- package/dist/index.js +443 -127
- package/dist/index.js.map +1 -1
- package/package.json +1 -2
- package/dist/chunk-7XM7HYRS.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -434,14 +434,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
434
434
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
435
435
|
continue;
|
|
436
436
|
}
|
|
437
|
-
|
|
438
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
439
|
-
continue;
|
|
440
|
-
}
|
|
437
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
441
438
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
442
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
443
|
-
if (expectedMessages.length === 0) {
|
|
444
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
439
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
440
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
441
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
445
442
|
continue;
|
|
446
443
|
}
|
|
447
444
|
if (expectedMessages.length > 1) {
|
|
@@ -459,17 +456,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
459
456
|
messageType: "input",
|
|
460
457
|
verbose
|
|
461
458
|
});
|
|
462
|
-
const outputSegments = await processMessages({
|
|
459
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
463
460
|
messages: expectedMessages,
|
|
464
461
|
searchRoots,
|
|
465
462
|
repoRootPath,
|
|
466
463
|
guidelinePatterns,
|
|
467
464
|
messageType: "output",
|
|
468
465
|
verbose
|
|
469
|
-
});
|
|
466
|
+
}) : [];
|
|
470
467
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
471
468
|
const expectedContent = expectedMessages[0]?.content;
|
|
472
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
469
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
473
470
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
474
471
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
475
472
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
@@ -488,6 +485,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
488
485
|
dataset: datasetName,
|
|
489
486
|
conversation_id: conversationId,
|
|
490
487
|
question,
|
|
488
|
+
input_messages: inputMessages,
|
|
491
489
|
input_segments: inputSegments,
|
|
492
490
|
output_segments: outputSegments,
|
|
493
491
|
reference_answer: referenceAnswer,
|
|
@@ -515,6 +513,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
515
513
|
}
|
|
516
514
|
return results;
|
|
517
515
|
}
|
|
516
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
517
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
518
|
+
return true;
|
|
519
|
+
}
|
|
520
|
+
let messagesWithContent = 0;
|
|
521
|
+
for (const segments of processedSegmentsByMessage) {
|
|
522
|
+
if (hasVisibleContent(segments)) {
|
|
523
|
+
messagesWithContent++;
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
return messagesWithContent > 1;
|
|
527
|
+
}
|
|
528
|
+
function hasVisibleContent(segments) {
|
|
529
|
+
return segments.some((segment) => {
|
|
530
|
+
const type = asString(segment.type);
|
|
531
|
+
if (type === "text") {
|
|
532
|
+
const value = asString(segment.value);
|
|
533
|
+
return value !== void 0 && value.trim().length > 0;
|
|
534
|
+
}
|
|
535
|
+
if (type === "guideline_ref") {
|
|
536
|
+
return false;
|
|
537
|
+
}
|
|
538
|
+
if (type === "file") {
|
|
539
|
+
const text = asString(segment.text);
|
|
540
|
+
return text !== void 0 && text.trim().length > 0;
|
|
541
|
+
}
|
|
542
|
+
return false;
|
|
543
|
+
});
|
|
544
|
+
}
|
|
545
|
+
function formatSegment(segment) {
|
|
546
|
+
const type = asString(segment.type);
|
|
547
|
+
if (type === "text") {
|
|
548
|
+
return asString(segment.value);
|
|
549
|
+
}
|
|
550
|
+
if (type === "guideline_ref") {
|
|
551
|
+
const refPath = asString(segment.path);
|
|
552
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
553
|
+
}
|
|
554
|
+
if (type === "file") {
|
|
555
|
+
const text = asString(segment.text);
|
|
556
|
+
const filePath = asString(segment.path);
|
|
557
|
+
if (text && filePath) {
|
|
558
|
+
return `=== ${filePath} ===
|
|
559
|
+
${text}`;
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
return void 0;
|
|
563
|
+
}
|
|
518
564
|
async function buildPromptInputs(testCase) {
|
|
519
565
|
const guidelineContents = [];
|
|
520
566
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -531,36 +577,168 @@ ${content}`);
|
|
|
531
577
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
532
578
|
}
|
|
533
579
|
}
|
|
534
|
-
const
|
|
580
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
581
|
+
const segmentsByMessage = [];
|
|
582
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
535
583
|
for (const segment of testCase.input_segments) {
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
const pathValue = segment.path;
|
|
539
|
-
const textValue = segment.text;
|
|
540
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
541
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
542
|
-
questionParts.push(`=== ${label} ===
|
|
543
|
-
${body}`);
|
|
544
|
-
continue;
|
|
584
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
585
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
545
586
|
}
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
587
|
+
}
|
|
588
|
+
for (const message of testCase.input_messages) {
|
|
589
|
+
const messageSegments = [];
|
|
590
|
+
if (typeof message.content === "string") {
|
|
591
|
+
if (message.content.trim().length > 0) {
|
|
592
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
593
|
+
}
|
|
594
|
+
} else if (Array.isArray(message.content)) {
|
|
595
|
+
for (const segment of message.content) {
|
|
596
|
+
if (typeof segment === "string") {
|
|
597
|
+
if (segment.trim().length > 0) {
|
|
598
|
+
messageSegments.push({ type: "text", value: segment });
|
|
599
|
+
}
|
|
600
|
+
} else if (isJsonObject(segment)) {
|
|
601
|
+
const type = asString(segment.type);
|
|
602
|
+
if (type === "file") {
|
|
603
|
+
const value = asString(segment.value);
|
|
604
|
+
if (!value) continue;
|
|
605
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
606
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
607
|
+
continue;
|
|
608
|
+
}
|
|
609
|
+
const fileText = fileContentsByPath.get(value);
|
|
610
|
+
if (fileText !== void 0) {
|
|
611
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
612
|
+
}
|
|
613
|
+
} else if (type === "text") {
|
|
614
|
+
const textValue = asString(segment.value);
|
|
615
|
+
if (textValue && textValue.trim().length > 0) {
|
|
616
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
segmentsByMessage.push(messageSegments);
|
|
623
|
+
}
|
|
624
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
625
|
+
let question;
|
|
626
|
+
if (useRoleMarkers) {
|
|
627
|
+
const messageParts = [];
|
|
628
|
+
for (let i = 0; i < testCase.input_messages.length; i++) {
|
|
629
|
+
const message = testCase.input_messages[i];
|
|
630
|
+
const segments = segmentsByMessage[i];
|
|
631
|
+
if (!hasVisibleContent(segments)) {
|
|
632
|
+
continue;
|
|
633
|
+
}
|
|
634
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
635
|
+
const contentParts = [];
|
|
636
|
+
for (const segment of segments) {
|
|
637
|
+
const formattedContent = formatSegment(segment);
|
|
638
|
+
if (formattedContent) {
|
|
639
|
+
contentParts.push(formattedContent);
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
if (contentParts.length > 0) {
|
|
643
|
+
const messageContent = contentParts.join("\n");
|
|
644
|
+
messageParts.push(`@[${roleLabel}]:
|
|
645
|
+
${messageContent}`);
|
|
550
646
|
}
|
|
551
|
-
continue;
|
|
552
647
|
}
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
648
|
+
question = messageParts.join("\n\n");
|
|
649
|
+
} else {
|
|
650
|
+
const questionParts = [];
|
|
651
|
+
for (const segment of testCase.input_segments) {
|
|
652
|
+
const formattedContent = formatSegment(segment);
|
|
653
|
+
if (formattedContent) {
|
|
654
|
+
questionParts.push(formattedContent);
|
|
655
|
+
}
|
|
556
656
|
}
|
|
657
|
+
if (testCase.code_snippets.length > 0) {
|
|
658
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
659
|
+
}
|
|
660
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
557
661
|
}
|
|
558
|
-
|
|
559
|
-
|
|
662
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
663
|
+
messages: testCase.input_messages,
|
|
664
|
+
segmentsByMessage,
|
|
665
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
666
|
+
guidelineContent: guidelines
|
|
667
|
+
}) : void 0;
|
|
668
|
+
return { question, guidelines, chatPrompt };
|
|
669
|
+
}
|
|
670
|
+
function buildChatPromptFromSegments(options) {
|
|
671
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
672
|
+
if (messages.length === 0) {
|
|
673
|
+
return void 0;
|
|
560
674
|
}
|
|
561
|
-
const
|
|
562
|
-
|
|
563
|
-
|
|
675
|
+
const systemSegments = [];
|
|
676
|
+
if (systemPrompt && systemPrompt.trim().length > 0) {
|
|
677
|
+
systemSegments.push(systemPrompt.trim());
|
|
678
|
+
}
|
|
679
|
+
if (guidelineContent && guidelineContent.trim().length > 0) {
|
|
680
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
681
|
+
|
|
682
|
+
${guidelineContent.trim()}`);
|
|
683
|
+
}
|
|
684
|
+
let startIndex = 0;
|
|
685
|
+
while (startIndex < messages.length && messages[startIndex].role === "system") {
|
|
686
|
+
const segments = segmentsByMessage[startIndex];
|
|
687
|
+
const contentParts = [];
|
|
688
|
+
for (const segment of segments) {
|
|
689
|
+
const formatted = formatSegment(segment);
|
|
690
|
+
if (formatted) {
|
|
691
|
+
contentParts.push(formatted);
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
if (contentParts.length > 0) {
|
|
695
|
+
systemSegments.push(contentParts.join("\n"));
|
|
696
|
+
}
|
|
697
|
+
startIndex += 1;
|
|
698
|
+
}
|
|
699
|
+
const chatPrompt = [];
|
|
700
|
+
if (systemSegments.length > 0) {
|
|
701
|
+
chatPrompt.push({
|
|
702
|
+
role: "system",
|
|
703
|
+
content: systemSegments.join("\n\n")
|
|
704
|
+
});
|
|
705
|
+
}
|
|
706
|
+
for (let i = startIndex; i < messages.length; i++) {
|
|
707
|
+
const message = messages[i];
|
|
708
|
+
const segments = segmentsByMessage[i];
|
|
709
|
+
const contentParts = [];
|
|
710
|
+
let role = message.role;
|
|
711
|
+
let name;
|
|
712
|
+
if (role === "system") {
|
|
713
|
+
role = "assistant";
|
|
714
|
+
contentParts.push("@[System]:");
|
|
715
|
+
} else if (role === "tool") {
|
|
716
|
+
role = "function";
|
|
717
|
+
name = "tool";
|
|
718
|
+
}
|
|
719
|
+
for (const segment of segments) {
|
|
720
|
+
if (segment.type === "guideline_ref") {
|
|
721
|
+
continue;
|
|
722
|
+
}
|
|
723
|
+
const formatted = formatSegment(segment);
|
|
724
|
+
if (formatted) {
|
|
725
|
+
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
726
|
+
if (isGuidelineRef) {
|
|
727
|
+
continue;
|
|
728
|
+
}
|
|
729
|
+
contentParts.push(formatted);
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
if (contentParts.length === 0) {
|
|
733
|
+
continue;
|
|
734
|
+
}
|
|
735
|
+
chatPrompt.push({
|
|
736
|
+
role,
|
|
737
|
+
content: contentParts.join("\n"),
|
|
738
|
+
...name ? { name } : {}
|
|
739
|
+
});
|
|
740
|
+
}
|
|
741
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
564
742
|
}
|
|
565
743
|
async function fileExists2(absolutePath) {
|
|
566
744
|
try {
|
|
@@ -757,21 +935,14 @@ var import_ax = require("@ax-llm/ax");
|
|
|
757
935
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
758
936
|
function buildChatPrompt(request) {
|
|
759
937
|
if (request.chatPrompt) {
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
} else {
|
|
767
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
768
|
-
}
|
|
769
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
770
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
771
|
-
|
|
772
|
-
${request.guidelines.trim()}`);
|
|
938
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
939
|
+
if (hasSystemMessage) {
|
|
940
|
+
return request.chatPrompt;
|
|
941
|
+
}
|
|
942
|
+
const systemContent2 = resolveSystemContent(request);
|
|
943
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
773
944
|
}
|
|
774
|
-
const systemContent =
|
|
945
|
+
const systemContent = resolveSystemContent(request);
|
|
775
946
|
const userContent = request.question.trim();
|
|
776
947
|
const prompt = [
|
|
777
948
|
{
|
|
@@ -785,6 +956,21 @@ ${request.guidelines.trim()}`);
|
|
|
785
956
|
];
|
|
786
957
|
return prompt;
|
|
787
958
|
}
|
|
959
|
+
function resolveSystemContent(request) {
|
|
960
|
+
const systemSegments = [];
|
|
961
|
+
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
962
|
+
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
963
|
+
systemSegments.push(metadataSystemPrompt.trim());
|
|
964
|
+
} else {
|
|
965
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
966
|
+
}
|
|
967
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
968
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
969
|
+
|
|
970
|
+
${request.guidelines.trim()}`);
|
|
971
|
+
}
|
|
972
|
+
return systemSegments.join("\n\n");
|
|
973
|
+
}
|
|
788
974
|
function extractModelConfig(request, defaults) {
|
|
789
975
|
const temperature = request.temperature ?? defaults.temperature;
|
|
790
976
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -828,6 +1014,67 @@ function ensureChatResponse(result) {
|
|
|
828
1014
|
}
|
|
829
1015
|
return result;
|
|
830
1016
|
}
|
|
1017
|
+
function isRetryableError(error, retryableStatusCodes) {
|
|
1018
|
+
if (!error || typeof error !== "object") {
|
|
1019
|
+
return false;
|
|
1020
|
+
}
|
|
1021
|
+
if ("status" in error && typeof error.status === "number") {
|
|
1022
|
+
return retryableStatusCodes.includes(error.status);
|
|
1023
|
+
}
|
|
1024
|
+
if ("message" in error && typeof error.message === "string") {
|
|
1025
|
+
const match = error.message.match(/HTTP (\d{3})/);
|
|
1026
|
+
if (match) {
|
|
1027
|
+
const status = Number.parseInt(match[1], 10);
|
|
1028
|
+
return retryableStatusCodes.includes(status);
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
if ("name" in error && error.name === "AxAIServiceNetworkError") {
|
|
1032
|
+
return true;
|
|
1033
|
+
}
|
|
1034
|
+
return false;
|
|
1035
|
+
}
|
|
1036
|
+
function calculateRetryDelay(attempt, config) {
|
|
1037
|
+
const delay = Math.min(
|
|
1038
|
+
config.maxDelayMs,
|
|
1039
|
+
config.initialDelayMs * config.backoffFactor ** attempt
|
|
1040
|
+
);
|
|
1041
|
+
return delay * (0.75 + Math.random() * 0.5);
|
|
1042
|
+
}
|
|
1043
|
+
async function sleep(ms) {
|
|
1044
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
1045
|
+
}
|
|
1046
|
+
async function withRetry(fn, retryConfig, signal) {
|
|
1047
|
+
const config = {
|
|
1048
|
+
maxRetries: retryConfig?.maxRetries ?? 3,
|
|
1049
|
+
initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
|
|
1050
|
+
maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
|
|
1051
|
+
backoffFactor: retryConfig?.backoffFactor ?? 2,
|
|
1052
|
+
retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
|
|
1053
|
+
};
|
|
1054
|
+
let lastError;
|
|
1055
|
+
for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
|
|
1056
|
+
if (signal?.aborted) {
|
|
1057
|
+
throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
|
|
1058
|
+
}
|
|
1059
|
+
try {
|
|
1060
|
+
return await fn();
|
|
1061
|
+
} catch (error) {
|
|
1062
|
+
lastError = error;
|
|
1063
|
+
if (attempt >= config.maxRetries) {
|
|
1064
|
+
break;
|
|
1065
|
+
}
|
|
1066
|
+
if (!isRetryableError(error, config.retryableStatusCodes)) {
|
|
1067
|
+
throw error;
|
|
1068
|
+
}
|
|
1069
|
+
const delay = calculateRetryDelay(attempt, config);
|
|
1070
|
+
await sleep(delay);
|
|
1071
|
+
if (signal?.aborted) {
|
|
1072
|
+
throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
|
|
1073
|
+
}
|
|
1074
|
+
}
|
|
1075
|
+
}
|
|
1076
|
+
throw lastError;
|
|
1077
|
+
}
|
|
831
1078
|
var AzureProvider = class {
|
|
832
1079
|
constructor(targetName, config) {
|
|
833
1080
|
this.config = config;
|
|
@@ -837,6 +1084,7 @@ var AzureProvider = class {
|
|
|
837
1084
|
temperature: config.temperature,
|
|
838
1085
|
maxOutputTokens: config.maxOutputTokens
|
|
839
1086
|
};
|
|
1087
|
+
this.retryConfig = config.retry;
|
|
840
1088
|
this.ai = import_ax.AxAI.create({
|
|
841
1089
|
name: "azure-openai",
|
|
842
1090
|
apiKey: config.apiKey,
|
|
@@ -853,16 +1101,21 @@ var AzureProvider = class {
|
|
|
853
1101
|
targetName;
|
|
854
1102
|
ai;
|
|
855
1103
|
defaults;
|
|
1104
|
+
retryConfig;
|
|
856
1105
|
async invoke(request) {
|
|
857
1106
|
const chatPrompt = buildChatPrompt(request);
|
|
858
1107
|
const modelConfig = extractModelConfig(request, this.defaults);
|
|
859
|
-
const response = await
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
1108
|
+
const response = await withRetry(
|
|
1109
|
+
async () => await this.ai.chat(
|
|
1110
|
+
{
|
|
1111
|
+
chatPrompt,
|
|
1112
|
+
model: this.config.deploymentName,
|
|
1113
|
+
...modelConfig ? { modelConfig } : {}
|
|
1114
|
+
},
|
|
1115
|
+
request.signal ? { abortSignal: request.signal } : void 0
|
|
1116
|
+
),
|
|
1117
|
+
this.retryConfig,
|
|
1118
|
+
request.signal
|
|
866
1119
|
);
|
|
867
1120
|
return mapResponse(ensureChatResponse(response));
|
|
868
1121
|
}
|
|
@@ -880,6 +1133,7 @@ var AnthropicProvider = class {
|
|
|
880
1133
|
maxOutputTokens: config.maxOutputTokens,
|
|
881
1134
|
thinkingBudget: config.thinkingBudget
|
|
882
1135
|
};
|
|
1136
|
+
this.retryConfig = config.retry;
|
|
883
1137
|
this.ai = import_ax.AxAI.create({
|
|
884
1138
|
name: "anthropic",
|
|
885
1139
|
apiKey: config.apiKey
|
|
@@ -890,16 +1144,21 @@ var AnthropicProvider = class {
|
|
|
890
1144
|
targetName;
|
|
891
1145
|
ai;
|
|
892
1146
|
defaults;
|
|
1147
|
+
retryConfig;
|
|
893
1148
|
async invoke(request) {
|
|
894
1149
|
const chatPrompt = buildChatPrompt(request);
|
|
895
1150
|
const modelConfig = extractModelConfig(request, this.defaults);
|
|
896
|
-
const response = await
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
1151
|
+
const response = await withRetry(
|
|
1152
|
+
async () => await this.ai.chat(
|
|
1153
|
+
{
|
|
1154
|
+
chatPrompt,
|
|
1155
|
+
model: this.config.model,
|
|
1156
|
+
...modelConfig ? { modelConfig } : {}
|
|
1157
|
+
},
|
|
1158
|
+
request.signal ? { abortSignal: request.signal } : void 0
|
|
1159
|
+
),
|
|
1160
|
+
this.retryConfig,
|
|
1161
|
+
request.signal
|
|
903
1162
|
);
|
|
904
1163
|
return mapResponse(ensureChatResponse(response));
|
|
905
1164
|
}
|
|
@@ -916,6 +1175,7 @@ var GeminiProvider = class {
|
|
|
916
1175
|
temperature: config.temperature,
|
|
917
1176
|
maxOutputTokens: config.maxOutputTokens
|
|
918
1177
|
};
|
|
1178
|
+
this.retryConfig = config.retry;
|
|
919
1179
|
this.ai = import_ax.AxAI.create({
|
|
920
1180
|
name: "google-gemini",
|
|
921
1181
|
apiKey: config.apiKey
|
|
@@ -926,16 +1186,21 @@ var GeminiProvider = class {
|
|
|
926
1186
|
targetName;
|
|
927
1187
|
ai;
|
|
928
1188
|
defaults;
|
|
1189
|
+
retryConfig;
|
|
929
1190
|
async invoke(request) {
|
|
930
1191
|
const chatPrompt = buildChatPrompt(request);
|
|
931
1192
|
const modelConfig = extractModelConfig(request, this.defaults);
|
|
932
|
-
const response = await
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
1193
|
+
const response = await withRetry(
|
|
1194
|
+
async () => await this.ai.chat(
|
|
1195
|
+
{
|
|
1196
|
+
chatPrompt,
|
|
1197
|
+
model: this.config.model,
|
|
1198
|
+
...modelConfig ? { modelConfig } : {}
|
|
1199
|
+
},
|
|
1200
|
+
request.signal ? { abortSignal: request.signal } : void 0
|
|
1201
|
+
),
|
|
1202
|
+
this.retryConfig,
|
|
1203
|
+
request.signal
|
|
939
1204
|
);
|
|
940
1205
|
return mapResponse(ensureChatResponse(response));
|
|
941
1206
|
}
|
|
@@ -1005,10 +1270,9 @@ var CliProvider = class {
|
|
|
1005
1270
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
1006
1271
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
1007
1272
|
const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
|
|
1008
|
-
const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
|
|
1009
1273
|
const result = await this.runCommand(renderedCommand, {
|
|
1010
1274
|
cwd: this.config.cwd,
|
|
1011
|
-
env,
|
|
1275
|
+
env: process.env,
|
|
1012
1276
|
timeoutMs: this.config.timeoutMs,
|
|
1013
1277
|
signal: request.signal
|
|
1014
1278
|
});
|
|
@@ -1097,10 +1361,9 @@ var CliProvider = class {
|
|
|
1097
1361
|
generateOutputFilePath("healthcheck")
|
|
1098
1362
|
)
|
|
1099
1363
|
);
|
|
1100
|
-
const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
|
|
1101
1364
|
const result = await this.runCommand(renderedCommand, {
|
|
1102
1365
|
cwd: healthcheck.cwd ?? this.config.cwd,
|
|
1103
|
-
env,
|
|
1366
|
+
env: process.env,
|
|
1104
1367
|
timeoutMs,
|
|
1105
1368
|
signal
|
|
1106
1369
|
});
|
|
@@ -2051,10 +2314,9 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID
|
|
|
2051
2314
|
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
2052
2315
|
name: import_zod.z.string().min(1, "target name is required"),
|
|
2053
2316
|
provider: import_zod.z.string().min(1, "provider is required"),
|
|
2054
|
-
settings: import_zod.z.record(import_zod.z.unknown()).optional(),
|
|
2055
2317
|
judge_target: import_zod.z.string().optional(),
|
|
2056
2318
|
workers: import_zod.z.number().int().min(1).optional()
|
|
2057
|
-
});
|
|
2319
|
+
}).passthrough();
|
|
2058
2320
|
var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
|
|
2059
2321
|
function normalizeAzureApiVersion(value) {
|
|
2060
2322
|
if (!value) {
|
|
@@ -2067,11 +2329,43 @@ function normalizeAzureApiVersion(value) {
|
|
|
2067
2329
|
const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
|
|
2068
2330
|
return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
|
|
2069
2331
|
}
|
|
2332
|
+
function resolveRetryConfig(target) {
|
|
2333
|
+
const maxRetries = resolveOptionalNumber(
|
|
2334
|
+
target.max_retries ?? target.maxRetries,
|
|
2335
|
+
`${target.name} max retries`
|
|
2336
|
+
);
|
|
2337
|
+
const initialDelayMs = resolveOptionalNumber(
|
|
2338
|
+
target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
|
|
2339
|
+
`${target.name} retry initial delay`
|
|
2340
|
+
);
|
|
2341
|
+
const maxDelayMs = resolveOptionalNumber(
|
|
2342
|
+
target.retry_max_delay_ms ?? target.retryMaxDelayMs,
|
|
2343
|
+
`${target.name} retry max delay`
|
|
2344
|
+
);
|
|
2345
|
+
const backoffFactor = resolveOptionalNumber(
|
|
2346
|
+
target.retry_backoff_factor ?? target.retryBackoffFactor,
|
|
2347
|
+
`${target.name} retry backoff factor`
|
|
2348
|
+
);
|
|
2349
|
+
const retryableStatusCodes = resolveOptionalNumberArray(
|
|
2350
|
+
target.retry_status_codes ?? target.retryStatusCodes,
|
|
2351
|
+
`${target.name} retry status codes`
|
|
2352
|
+
);
|
|
2353
|
+
if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
|
|
2354
|
+
return void 0;
|
|
2355
|
+
}
|
|
2356
|
+
return {
|
|
2357
|
+
maxRetries,
|
|
2358
|
+
initialDelayMs,
|
|
2359
|
+
maxDelayMs,
|
|
2360
|
+
backoffFactor,
|
|
2361
|
+
retryableStatusCodes
|
|
2362
|
+
};
|
|
2363
|
+
}
|
|
2070
2364
|
function resolveTargetDefinition(definition, env = process.env) {
|
|
2071
2365
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
2072
2366
|
const provider = parsed.provider.toLowerCase();
|
|
2073
2367
|
const providerBatching = resolveOptionalBoolean(
|
|
2074
|
-
parsed.
|
|
2368
|
+
parsed.provider_batching ?? parsed.providerBatching
|
|
2075
2369
|
);
|
|
2076
2370
|
switch (provider) {
|
|
2077
2371
|
case "azure":
|
|
@@ -2147,13 +2441,12 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
2147
2441
|
}
|
|
2148
2442
|
}
|
|
2149
2443
|
function resolveAzureConfig(target, env) {
|
|
2150
|
-
const
|
|
2151
|
-
const
|
|
2152
|
-
const
|
|
2153
|
-
const
|
|
2154
|
-
const
|
|
2155
|
-
const
|
|
2156
|
-
const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
|
|
2444
|
+
const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
|
|
2445
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
2446
|
+
const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
|
|
2447
|
+
const versionSource = target.version ?? target.api_version;
|
|
2448
|
+
const temperatureSource = target.temperature;
|
|
2449
|
+
const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
|
|
2157
2450
|
const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
|
|
2158
2451
|
const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
|
|
2159
2452
|
const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
|
|
@@ -2165,58 +2458,61 @@ function resolveAzureConfig(target, env) {
|
|
|
2165
2458
|
maxTokensSource,
|
|
2166
2459
|
`${target.name} max output tokens`
|
|
2167
2460
|
);
|
|
2461
|
+
const retry = resolveRetryConfig(target);
|
|
2168
2462
|
return {
|
|
2169
2463
|
resourceName,
|
|
2170
2464
|
deploymentName,
|
|
2171
2465
|
apiKey,
|
|
2172
2466
|
version,
|
|
2173
2467
|
temperature,
|
|
2174
|
-
maxOutputTokens
|
|
2468
|
+
maxOutputTokens,
|
|
2469
|
+
retry
|
|
2175
2470
|
};
|
|
2176
2471
|
}
|
|
2177
2472
|
function resolveAnthropicConfig(target, env) {
|
|
2178
|
-
const
|
|
2179
|
-
const
|
|
2180
|
-
const
|
|
2181
|
-
const
|
|
2182
|
-
const
|
|
2183
|
-
const thinkingBudgetSource = settings.thinking_budget ?? settings.thinkingBudget;
|
|
2473
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
2474
|
+
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
2475
|
+
const temperatureSource = target.temperature;
|
|
2476
|
+
const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
|
|
2477
|
+
const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
|
|
2184
2478
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
|
|
2185
2479
|
const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
|
|
2480
|
+
const retry = resolveRetryConfig(target);
|
|
2186
2481
|
return {
|
|
2187
2482
|
apiKey,
|
|
2188
2483
|
model,
|
|
2189
2484
|
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
2190
2485
|
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
2191
|
-
thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`)
|
|
2486
|
+
thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`),
|
|
2487
|
+
retry
|
|
2192
2488
|
};
|
|
2193
2489
|
}
|
|
2194
2490
|
function resolveGeminiConfig(target, env) {
|
|
2195
|
-
const
|
|
2196
|
-
const
|
|
2197
|
-
const
|
|
2198
|
-
const
|
|
2199
|
-
const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
|
|
2491
|
+
const apiKeySource = target.api_key ?? target.apiKey;
|
|
2492
|
+
const modelSource = target.model ?? target.deployment ?? target.variant;
|
|
2493
|
+
const temperatureSource = target.temperature;
|
|
2494
|
+
const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
|
|
2200
2495
|
const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
|
|
2201
2496
|
const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
|
|
2202
2497
|
allowLiteral: true,
|
|
2203
2498
|
optionalEnv: true
|
|
2204
2499
|
}) ?? "gemini-2.5-flash";
|
|
2500
|
+
const retry = resolveRetryConfig(target);
|
|
2205
2501
|
return {
|
|
2206
2502
|
apiKey,
|
|
2207
2503
|
model,
|
|
2208
2504
|
temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
|
|
2209
|
-
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
|
|
2505
|
+
maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
|
|
2506
|
+
retry
|
|
2210
2507
|
};
|
|
2211
2508
|
}
|
|
2212
2509
|
function resolveCodexConfig(target, env) {
|
|
2213
|
-
const
|
|
2214
|
-
const
|
|
2215
|
-
const
|
|
2216
|
-
const
|
|
2217
|
-
const
|
|
2218
|
-
const
|
|
2219
|
-
const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
2510
|
+
const executableSource = target.executable ?? target.command ?? target.binary;
|
|
2511
|
+
const argsSource = target.args ?? target.arguments;
|
|
2512
|
+
const cwdSource = target.cwd;
|
|
2513
|
+
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
2514
|
+
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
2515
|
+
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
2220
2516
|
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
2221
2517
|
allowLiteral: true,
|
|
2222
2518
|
optionalEnv: true
|
|
@@ -2255,21 +2551,19 @@ function normalizeCodexLogFormat(value) {
|
|
|
2255
2551
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
2256
2552
|
}
|
|
2257
2553
|
function resolveMockConfig(target) {
|
|
2258
|
-
const
|
|
2259
|
-
const response = typeof settings.response === "string" ? settings.response : void 0;
|
|
2554
|
+
const response = typeof target.response === "string" ? target.response : void 0;
|
|
2260
2555
|
return { response };
|
|
2261
2556
|
}
|
|
2262
2557
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
2263
|
-
const
|
|
2264
|
-
const workspaceTemplateEnvVar = resolveOptionalLiteralString(settings.workspace_template ?? settings.workspaceTemplate);
|
|
2558
|
+
const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template ?? target.workspaceTemplate);
|
|
2265
2559
|
const workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(workspaceTemplateEnvVar, env, `${target.name} workspace template path`, {
|
|
2266
2560
|
allowLiteral: false,
|
|
2267
2561
|
optionalEnv: true
|
|
2268
2562
|
}) : void 0;
|
|
2269
|
-
const commandSource =
|
|
2270
|
-
const waitSource =
|
|
2271
|
-
const dryRunSource =
|
|
2272
|
-
const subagentRootSource =
|
|
2563
|
+
const commandSource = target.vscode_cmd ?? target.command;
|
|
2564
|
+
const waitSource = target.wait;
|
|
2565
|
+
const dryRunSource = target.dry_run ?? target.dryRun;
|
|
2566
|
+
const subagentRootSource = target.subagent_root ?? target.subagentRoot;
|
|
2273
2567
|
const defaultCommand = insiders ? "code-insiders" : "code";
|
|
2274
2568
|
const command = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
|
|
2275
2569
|
return {
|
|
@@ -2284,18 +2578,16 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
2284
2578
|
};
|
|
2285
2579
|
}
|
|
2286
2580
|
function resolveCliConfig(target, env) {
|
|
2287
|
-
const
|
|
2288
|
-
const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
|
|
2581
|
+
const commandTemplateSource = target.command_template ?? target.commandTemplate;
|
|
2289
2582
|
const filesFormat = resolveOptionalLiteralString(
|
|
2290
|
-
|
|
2583
|
+
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
2291
2584
|
);
|
|
2292
|
-
const cwd = resolveOptionalString(
|
|
2585
|
+
const cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
2293
2586
|
allowLiteral: true,
|
|
2294
2587
|
optionalEnv: true
|
|
2295
2588
|
});
|
|
2296
|
-
const
|
|
2297
|
-
const
|
|
2298
|
-
const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
|
|
2589
|
+
const timeoutMs = resolveTimeoutMs(target.timeout_seconds ?? target.timeoutSeconds, `${target.name} timeout`);
|
|
2590
|
+
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
|
|
2299
2591
|
const commandTemplate = resolveString(
|
|
2300
2592
|
commandTemplateSource,
|
|
2301
2593
|
env,
|
|
@@ -2307,29 +2599,10 @@ function resolveCliConfig(target, env) {
|
|
|
2307
2599
|
commandTemplate,
|
|
2308
2600
|
filesFormat,
|
|
2309
2601
|
cwd,
|
|
2310
|
-
env: envOverrides,
|
|
2311
2602
|
timeoutMs,
|
|
2312
2603
|
healthcheck
|
|
2313
2604
|
};
|
|
2314
2605
|
}
|
|
2315
|
-
function resolveEnvOverrides(source, env, targetName) {
|
|
2316
|
-
if (source === void 0 || source === null) {
|
|
2317
|
-
return void 0;
|
|
2318
|
-
}
|
|
2319
|
-
if (typeof source !== "object" || Array.isArray(source)) {
|
|
2320
|
-
throw new Error(`${targetName} env overrides must be an object map of strings`);
|
|
2321
|
-
}
|
|
2322
|
-
const entries = Object.entries(source);
|
|
2323
|
-
const resolved = {};
|
|
2324
|
-
for (const [key, value] of entries) {
|
|
2325
|
-
if (typeof value !== "string") {
|
|
2326
|
-
throw new Error(`${targetName} env override '${key}' must be a string`);
|
|
2327
|
-
}
|
|
2328
|
-
const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
|
|
2329
|
-
resolved[key] = resolvedValue;
|
|
2330
|
-
}
|
|
2331
|
-
return Object.keys(resolved).length > 0 ? resolved : void 0;
|
|
2332
|
-
}
|
|
2333
2606
|
function resolveTimeoutMs(source, description) {
|
|
2334
2607
|
const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
|
|
2335
2608
|
if (seconds === void 0) {
|
|
@@ -2525,6 +2798,26 @@ function resolveOptionalStringArray(source, env, description) {
|
|
|
2525
2798
|
}
|
|
2526
2799
|
return resolved.length > 0 ? resolved : void 0;
|
|
2527
2800
|
}
|
|
2801
|
+
function resolveOptionalNumberArray(source, description) {
|
|
2802
|
+
if (source === void 0 || source === null) {
|
|
2803
|
+
return void 0;
|
|
2804
|
+
}
|
|
2805
|
+
if (!Array.isArray(source)) {
|
|
2806
|
+
throw new Error(`${description} must be an array of numbers`);
|
|
2807
|
+
}
|
|
2808
|
+
if (source.length === 0) {
|
|
2809
|
+
return void 0;
|
|
2810
|
+
}
|
|
2811
|
+
const resolved = [];
|
|
2812
|
+
for (let i = 0; i < source.length; i++) {
|
|
2813
|
+
const item = source[i];
|
|
2814
|
+
if (typeof item !== "number" || !Number.isFinite(item)) {
|
|
2815
|
+
throw new Error(`${description}[${i}] must be a number`);
|
|
2816
|
+
}
|
|
2817
|
+
resolved.push(item);
|
|
2818
|
+
}
|
|
2819
|
+
return resolved.length > 0 ? resolved : void 0;
|
|
2820
|
+
}
|
|
2528
2821
|
|
|
2529
2822
|
// src/evaluation/providers/vscode.ts
|
|
2530
2823
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
@@ -2784,7 +3077,7 @@ var AGENT_PROVIDER_KINDS = [
|
|
|
2784
3077
|
"vscode",
|
|
2785
3078
|
"vscode-insiders"
|
|
2786
3079
|
];
|
|
2787
|
-
var TARGETS_SCHEMA_V2 = "agentv-targets-v2.
|
|
3080
|
+
var TARGETS_SCHEMA_V2 = "agentv-targets-v2.2";
|
|
2788
3081
|
function isAgentProvider(provider) {
|
|
2789
3082
|
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
2790
3083
|
}
|
|
@@ -2827,20 +3120,13 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
2827
3120
|
}
|
|
2828
3121
|
const name = value.name;
|
|
2829
3122
|
const provider = value.provider;
|
|
2830
|
-
const settings = value.settings;
|
|
2831
|
-
const judgeTarget = value.judge_target;
|
|
2832
3123
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
2833
3124
|
throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
|
|
2834
3125
|
}
|
|
2835
3126
|
if (typeof provider !== "string" || provider.trim().length === 0) {
|
|
2836
3127
|
throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
|
|
2837
3128
|
}
|
|
2838
|
-
return
|
|
2839
|
-
name,
|
|
2840
|
-
provider,
|
|
2841
|
-
settings: isRecord(settings) ? settings : void 0,
|
|
2842
|
-
judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
|
|
2843
|
-
};
|
|
3129
|
+
return value;
|
|
2844
3130
|
}
|
|
2845
3131
|
async function fileExists3(filePath) {
|
|
2846
3132
|
try {
|
|
@@ -2920,19 +3206,21 @@ var LlmJudgeEvaluator = class {
|
|
|
2920
3206
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2921
3207
|
}
|
|
2922
3208
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2923
|
-
|
|
2924
|
-
|
|
3209
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
3210
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3211
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
|
|
3212
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
2925
3213
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2926
3214
|
const variables = {
|
|
2927
3215
|
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2928
3216
|
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2929
3217
|
candidate_answer: context.candidate,
|
|
2930
|
-
reference_answer: context.evalCase.reference_answer,
|
|
3218
|
+
reference_answer: context.evalCase.reference_answer ?? "",
|
|
2931
3219
|
expected_outcome: context.evalCase.expected_outcome,
|
|
2932
|
-
question:
|
|
3220
|
+
question: formattedQuestion
|
|
2933
3221
|
};
|
|
2934
3222
|
prompt = substituteVariables(systemPrompt, variables);
|
|
2935
|
-
systemPrompt =
|
|
3223
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2936
3224
|
}
|
|
2937
3225
|
const metadata = {
|
|
2938
3226
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
@@ -2970,38 +3258,51 @@ var LlmJudgeEvaluator = class {
|
|
|
2970
3258
|
};
|
|
2971
3259
|
}
|
|
2972
3260
|
};
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
|
|
2980
|
-
|
|
2981
|
-
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
3261
|
+
function buildSystemPrompt(hasReferenceAnswer) {
|
|
3262
|
+
const basePrompt = [
|
|
3263
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
3264
|
+
""
|
|
3265
|
+
];
|
|
3266
|
+
if (hasReferenceAnswer) {
|
|
3267
|
+
basePrompt.push(
|
|
3268
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
3269
|
+
""
|
|
3270
|
+
);
|
|
3271
|
+
}
|
|
3272
|
+
basePrompt.push(
|
|
3273
|
+
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
3274
|
+
"",
|
|
3275
|
+
"You must respond with a single JSON object matching this schema:",
|
|
3276
|
+
"",
|
|
3277
|
+
"{",
|
|
3278
|
+
' "score": <number between 0.0 and 1.0>,',
|
|
3279
|
+
' "hits": [<array of strings, max 4 items, brief specific achievements>],',
|
|
3280
|
+
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
3281
|
+
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
3282
|
+
"}"
|
|
3283
|
+
);
|
|
3284
|
+
return basePrompt.join("\n");
|
|
3285
|
+
}
|
|
3286
|
+
function buildQualityPrompt(evalCase, candidate, question) {
|
|
2990
3287
|
const parts = [
|
|
2991
3288
|
"[[ ## expected_outcome ## ]]",
|
|
2992
3289
|
evalCase.expected_outcome.trim(),
|
|
2993
3290
|
"",
|
|
2994
3291
|
"[[ ## question ## ]]",
|
|
2995
|
-
|
|
2996
|
-
""
|
|
2997
|
-
"[[ ## reference_answer ## ]]",
|
|
2998
|
-
evalCase.reference_answer.trim(),
|
|
2999
|
-
"",
|
|
3000
|
-
"[[ ## candidate_answer ## ]]",
|
|
3001
|
-
candidate.trim(),
|
|
3002
|
-
"",
|
|
3003
|
-
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
3292
|
+
question.trim(),
|
|
3293
|
+
""
|
|
3004
3294
|
];
|
|
3295
|
+
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
3296
|
+
parts.push(
|
|
3297
|
+
"[[ ## reference_answer ## ]]",
|
|
3298
|
+
evalCase.reference_answer.trim(),
|
|
3299
|
+
""
|
|
3300
|
+
);
|
|
3301
|
+
}
|
|
3302
|
+
parts.push(
|
|
3303
|
+
"[[ ## candidate_answer ## ]]",
|
|
3304
|
+
candidate.trim()
|
|
3305
|
+
);
|
|
3005
3306
|
return parts.join("\n");
|
|
3006
3307
|
}
|
|
3007
3308
|
function clampScore(value) {
|
|
@@ -3084,6 +3385,9 @@ function extractJsonBlob(text) {
|
|
|
3084
3385
|
function isNonEmptyString(value) {
|
|
3085
3386
|
return typeof value === "string" && value.trim().length > 0;
|
|
3086
3387
|
}
|
|
3388
|
+
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
3389
|
+
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
3390
|
+
}
|
|
3087
3391
|
var CodeEvaluator = class {
|
|
3088
3392
|
kind = "code";
|
|
3089
3393
|
script;
|
|
@@ -3481,10 +3785,11 @@ async function runEvaluation(options) {
|
|
|
3481
3785
|
await onProgress({
|
|
3482
3786
|
workerId,
|
|
3483
3787
|
evalId: evalCase.id,
|
|
3484
|
-
status: "completed",
|
|
3788
|
+
status: result.error ? "failed" : "completed",
|
|
3485
3789
|
startedAt: 0,
|
|
3486
3790
|
// Not used for completed status
|
|
3487
|
-
completedAt: Date.now()
|
|
3791
|
+
completedAt: Date.now(),
|
|
3792
|
+
error: result.error
|
|
3488
3793
|
});
|
|
3489
3794
|
}
|
|
3490
3795
|
if (onResult) {
|
|
@@ -3741,11 +4046,27 @@ async function evaluateCandidate(options) {
|
|
|
3741
4046
|
agentTimeoutMs
|
|
3742
4047
|
});
|
|
3743
4048
|
const completedAt = nowFn();
|
|
3744
|
-
|
|
3745
|
-
|
|
3746
|
-
|
|
3747
|
-
|
|
3748
|
-
|
|
4049
|
+
let agentProviderRequest;
|
|
4050
|
+
let lmProviderRequest;
|
|
4051
|
+
if (isAgentProvider(provider)) {
|
|
4052
|
+
agentProviderRequest = {
|
|
4053
|
+
question: promptInputs.question,
|
|
4054
|
+
guideline_paths: evalCase.guideline_paths
|
|
4055
|
+
};
|
|
4056
|
+
} else {
|
|
4057
|
+
if (promptInputs.chatPrompt) {
|
|
4058
|
+
lmProviderRequest = {
|
|
4059
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
4060
|
+
guideline_paths: evalCase.guideline_paths
|
|
4061
|
+
};
|
|
4062
|
+
} else {
|
|
4063
|
+
lmProviderRequest = {
|
|
4064
|
+
question: promptInputs.question,
|
|
4065
|
+
guidelines: promptInputs.guidelines,
|
|
4066
|
+
guideline_paths: evalCase.guideline_paths
|
|
4067
|
+
};
|
|
4068
|
+
}
|
|
4069
|
+
}
|
|
3749
4070
|
return {
|
|
3750
4071
|
eval_id: evalCase.id,
|
|
3751
4072
|
dataset: evalCase.dataset,
|
|
@@ -3759,7 +4080,8 @@ async function evaluateCandidate(options) {
|
|
|
3759
4080
|
timestamp: completedAt.toISOString(),
|
|
3760
4081
|
reasoning: score.reasoning,
|
|
3761
4082
|
raw_aspects: score.rawAspects,
|
|
3762
|
-
|
|
4083
|
+
agent_provider_request: agentProviderRequest,
|
|
4084
|
+
lm_provider_request: lmProviderRequest,
|
|
3763
4085
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3764
4086
|
evaluator_results: evaluatorResults
|
|
3765
4087
|
};
|
|
@@ -3988,6 +4310,7 @@ async function invokeProvider(provider, options) {
|
|
|
3988
4310
|
question: promptInputs.question,
|
|
3989
4311
|
guidelines: promptInputs.guidelines,
|
|
3990
4312
|
guideline_patterns: evalCase.guideline_patterns,
|
|
4313
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
3991
4314
|
inputFiles: evalCase.file_paths,
|
|
3992
4315
|
evalCaseId: evalCase.id,
|
|
3993
4316
|
attempt,
|
|
@@ -4004,12 +4327,30 @@ async function invokeProvider(provider, options) {
|
|
|
4004
4327
|
}
|
|
4005
4328
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
4006
4329
|
const message = error instanceof Error ? error.message : String(error);
|
|
4007
|
-
|
|
4008
|
-
|
|
4009
|
-
|
|
4010
|
-
|
|
4011
|
-
|
|
4012
|
-
|
|
4330
|
+
let agentProviderRequest;
|
|
4331
|
+
let lmProviderRequest;
|
|
4332
|
+
if (isAgentProvider(provider)) {
|
|
4333
|
+
agentProviderRequest = {
|
|
4334
|
+
question: promptInputs.question,
|
|
4335
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4336
|
+
error: message
|
|
4337
|
+
};
|
|
4338
|
+
} else {
|
|
4339
|
+
if (promptInputs.chatPrompt) {
|
|
4340
|
+
lmProviderRequest = {
|
|
4341
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
4342
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4343
|
+
error: message
|
|
4344
|
+
};
|
|
4345
|
+
} else {
|
|
4346
|
+
lmProviderRequest = {
|
|
4347
|
+
question: promptInputs.question,
|
|
4348
|
+
guidelines: promptInputs.guidelines,
|
|
4349
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4350
|
+
error: message
|
|
4351
|
+
};
|
|
4352
|
+
}
|
|
4353
|
+
}
|
|
4013
4354
|
return {
|
|
4014
4355
|
eval_id: evalCase.id,
|
|
4015
4356
|
dataset: evalCase.dataset,
|
|
@@ -4022,7 +4363,9 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4022
4363
|
target: targetName,
|
|
4023
4364
|
timestamp: timestamp.toISOString(),
|
|
4024
4365
|
raw_aspects: [],
|
|
4025
|
-
|
|
4366
|
+
agent_provider_request: agentProviderRequest,
|
|
4367
|
+
lm_provider_request: lmProviderRequest,
|
|
4368
|
+
error: message
|
|
4026
4369
|
};
|
|
4027
4370
|
}
|
|
4028
4371
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
@@ -4033,6 +4376,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
4033
4376
|
hash.update(promptInputs.question);
|
|
4034
4377
|
hash.update(promptInputs.guidelines);
|
|
4035
4378
|
hash.update(promptInputs.systemMessage ?? "");
|
|
4379
|
+
if (promptInputs.chatPrompt) {
|
|
4380
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
4381
|
+
}
|
|
4036
4382
|
return hash.digest("hex");
|
|
4037
4383
|
}
|
|
4038
4384
|
function isTimeoutLike(error) {
|