@agentv/core 0.9.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SNTZFB24.js → chunk-YQBJAT5I.js} +1 -1
- package/dist/{chunk-SNTZFB24.js.map → chunk-YQBJAT5I.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +30 -13
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +21 -4
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +375 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +74 -64
- package/dist/index.d.ts +74 -64
- package/dist/index.js +375 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -54,6 +54,7 @@ __export(index_exports, {
|
|
|
54
54
|
loadEvalCases: () => loadEvalCases,
|
|
55
55
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
56
56
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
57
|
+
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
57
58
|
readTextFile: () => readTextFile,
|
|
58
59
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
59
60
|
resolveFileReference: () => resolveFileReference,
|
|
@@ -239,6 +240,33 @@ var ANSI_YELLOW = "\x1B[33m";
|
|
|
239
240
|
var ANSI_RESET = "\x1B[0m";
|
|
240
241
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
241
242
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
243
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
244
|
+
try {
|
|
245
|
+
const absolutePath = import_node_path2.default.resolve(testFilePath);
|
|
246
|
+
const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
|
|
247
|
+
const parsed = (0, import_yaml.parse)(content);
|
|
248
|
+
if (!isJsonObject(parsed)) {
|
|
249
|
+
return {};
|
|
250
|
+
}
|
|
251
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
252
|
+
} catch {
|
|
253
|
+
return {};
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
function extractTargetFromSuite(suite) {
|
|
257
|
+
const execution = suite.execution;
|
|
258
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
259
|
+
const executionTarget = execution.target;
|
|
260
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
261
|
+
return executionTarget.trim();
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
const targetValue = suite.target;
|
|
265
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
266
|
+
return targetValue.trim();
|
|
267
|
+
}
|
|
268
|
+
return void 0;
|
|
269
|
+
}
|
|
242
270
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
243
271
|
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
244
272
|
for (const directory of directories) {
|
|
@@ -415,6 +443,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
415
443
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
416
444
|
}
|
|
417
445
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
446
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
447
|
+
const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
|
|
418
448
|
const results = [];
|
|
419
449
|
for (const rawEvalcase of rawTestcases) {
|
|
420
450
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -434,14 +464,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
434
464
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
435
465
|
continue;
|
|
436
466
|
}
|
|
437
|
-
|
|
438
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
439
|
-
continue;
|
|
440
|
-
}
|
|
467
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
441
468
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
442
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
443
|
-
if (expectedMessages.length === 0) {
|
|
444
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
469
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
470
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
471
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
445
472
|
continue;
|
|
446
473
|
}
|
|
447
474
|
if (expectedMessages.length > 1) {
|
|
@@ -459,20 +486,20 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
459
486
|
messageType: "input",
|
|
460
487
|
verbose
|
|
461
488
|
});
|
|
462
|
-
const outputSegments = await processMessages({
|
|
489
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
463
490
|
messages: expectedMessages,
|
|
464
491
|
searchRoots,
|
|
465
492
|
repoRootPath,
|
|
466
493
|
guidelinePatterns,
|
|
467
494
|
messageType: "output",
|
|
468
495
|
verbose
|
|
469
|
-
});
|
|
496
|
+
}) : [];
|
|
470
497
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
471
498
|
const expectedContent = expectedMessages[0]?.content;
|
|
472
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
499
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
473
500
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
474
501
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
475
|
-
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
502
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
476
503
|
const userFilePaths = [];
|
|
477
504
|
for (const segment of inputSegments) {
|
|
478
505
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -488,6 +515,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
488
515
|
dataset: datasetName,
|
|
489
516
|
conversation_id: conversationId,
|
|
490
517
|
question,
|
|
518
|
+
input_messages: inputMessages,
|
|
491
519
|
input_segments: inputSegments,
|
|
492
520
|
output_segments: outputSegments,
|
|
493
521
|
reference_answer: referenceAnswer,
|
|
@@ -515,6 +543,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
515
543
|
}
|
|
516
544
|
return results;
|
|
517
545
|
}
|
|
546
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
547
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
548
|
+
return true;
|
|
549
|
+
}
|
|
550
|
+
let messagesWithContent = 0;
|
|
551
|
+
for (const segments of processedSegmentsByMessage) {
|
|
552
|
+
if (hasVisibleContent(segments)) {
|
|
553
|
+
messagesWithContent++;
|
|
554
|
+
}
|
|
555
|
+
}
|
|
556
|
+
return messagesWithContent > 1;
|
|
557
|
+
}
|
|
558
|
+
function hasVisibleContent(segments) {
|
|
559
|
+
return segments.some((segment) => {
|
|
560
|
+
const type = asString(segment.type);
|
|
561
|
+
if (type === "text") {
|
|
562
|
+
const value = asString(segment.value);
|
|
563
|
+
return value !== void 0 && value.trim().length > 0;
|
|
564
|
+
}
|
|
565
|
+
if (type === "guideline_ref") {
|
|
566
|
+
return false;
|
|
567
|
+
}
|
|
568
|
+
if (type === "file") {
|
|
569
|
+
const text = asString(segment.text);
|
|
570
|
+
return text !== void 0 && text.trim().length > 0;
|
|
571
|
+
}
|
|
572
|
+
return false;
|
|
573
|
+
});
|
|
574
|
+
}
|
|
575
|
+
function formatSegment(segment) {
|
|
576
|
+
const type = asString(segment.type);
|
|
577
|
+
if (type === "text") {
|
|
578
|
+
return asString(segment.value);
|
|
579
|
+
}
|
|
580
|
+
if (type === "guideline_ref") {
|
|
581
|
+
const refPath = asString(segment.path);
|
|
582
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
583
|
+
}
|
|
584
|
+
if (type === "file") {
|
|
585
|
+
const text = asString(segment.text);
|
|
586
|
+
const filePath = asString(segment.path);
|
|
587
|
+
if (text && filePath) {
|
|
588
|
+
return `=== ${filePath} ===
|
|
589
|
+
${text}`;
|
|
590
|
+
}
|
|
591
|
+
}
|
|
592
|
+
return void 0;
|
|
593
|
+
}
|
|
518
594
|
async function buildPromptInputs(testCase) {
|
|
519
595
|
const guidelineContents = [];
|
|
520
596
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -531,36 +607,168 @@ ${content}`);
|
|
|
531
607
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
532
608
|
}
|
|
533
609
|
}
|
|
534
|
-
const
|
|
610
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
611
|
+
const segmentsByMessage = [];
|
|
612
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
535
613
|
for (const segment of testCase.input_segments) {
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
const pathValue = segment.path;
|
|
539
|
-
const textValue = segment.text;
|
|
540
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
541
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
542
|
-
questionParts.push(`=== ${label} ===
|
|
543
|
-
${body}`);
|
|
544
|
-
continue;
|
|
614
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
615
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
545
616
|
}
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
617
|
+
}
|
|
618
|
+
for (const message of testCase.input_messages) {
|
|
619
|
+
const messageSegments = [];
|
|
620
|
+
if (typeof message.content === "string") {
|
|
621
|
+
if (message.content.trim().length > 0) {
|
|
622
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
623
|
+
}
|
|
624
|
+
} else if (Array.isArray(message.content)) {
|
|
625
|
+
for (const segment of message.content) {
|
|
626
|
+
if (typeof segment === "string") {
|
|
627
|
+
if (segment.trim().length > 0) {
|
|
628
|
+
messageSegments.push({ type: "text", value: segment });
|
|
629
|
+
}
|
|
630
|
+
} else if (isJsonObject(segment)) {
|
|
631
|
+
const type = asString(segment.type);
|
|
632
|
+
if (type === "file") {
|
|
633
|
+
const value = asString(segment.value);
|
|
634
|
+
if (!value) continue;
|
|
635
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
636
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
637
|
+
continue;
|
|
638
|
+
}
|
|
639
|
+
const fileText = fileContentsByPath.get(value);
|
|
640
|
+
if (fileText !== void 0) {
|
|
641
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
642
|
+
}
|
|
643
|
+
} else if (type === "text") {
|
|
644
|
+
const textValue = asString(segment.value);
|
|
645
|
+
if (textValue && textValue.trim().length > 0) {
|
|
646
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
segmentsByMessage.push(messageSegments);
|
|
653
|
+
}
|
|
654
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
655
|
+
let question;
|
|
656
|
+
if (useRoleMarkers) {
|
|
657
|
+
const messageParts = [];
|
|
658
|
+
for (let i = 0; i < testCase.input_messages.length; i++) {
|
|
659
|
+
const message = testCase.input_messages[i];
|
|
660
|
+
const segments = segmentsByMessage[i];
|
|
661
|
+
if (!hasVisibleContent(segments)) {
|
|
662
|
+
continue;
|
|
663
|
+
}
|
|
664
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
665
|
+
const contentParts = [];
|
|
666
|
+
for (const segment of segments) {
|
|
667
|
+
const formattedContent = formatSegment(segment);
|
|
668
|
+
if (formattedContent) {
|
|
669
|
+
contentParts.push(formattedContent);
|
|
670
|
+
}
|
|
671
|
+
}
|
|
672
|
+
if (contentParts.length > 0) {
|
|
673
|
+
const messageContent = contentParts.join("\n");
|
|
674
|
+
messageParts.push(`@[${roleLabel}]:
|
|
675
|
+
${messageContent}`);
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
question = messageParts.join("\n\n");
|
|
679
|
+
} else {
|
|
680
|
+
const questionParts = [];
|
|
681
|
+
for (const segment of testCase.input_segments) {
|
|
682
|
+
const formattedContent = formatSegment(segment);
|
|
683
|
+
if (formattedContent) {
|
|
684
|
+
questionParts.push(formattedContent);
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
if (testCase.code_snippets.length > 0) {
|
|
688
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
689
|
+
}
|
|
690
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
691
|
+
}
|
|
692
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
693
|
+
messages: testCase.input_messages,
|
|
694
|
+
segmentsByMessage,
|
|
695
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
696
|
+
guidelineContent: guidelines
|
|
697
|
+
}) : void 0;
|
|
698
|
+
return { question, guidelines, chatPrompt };
|
|
699
|
+
}
|
|
700
|
+
function buildChatPromptFromSegments(options) {
|
|
701
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
702
|
+
if (messages.length === 0) {
|
|
703
|
+
return void 0;
|
|
704
|
+
}
|
|
705
|
+
const systemSegments = [];
|
|
706
|
+
if (systemPrompt && systemPrompt.trim().length > 0) {
|
|
707
|
+
systemSegments.push(systemPrompt.trim());
|
|
708
|
+
}
|
|
709
|
+
if (guidelineContent && guidelineContent.trim().length > 0) {
|
|
710
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
711
|
+
|
|
712
|
+
${guidelineContent.trim()}`);
|
|
713
|
+
}
|
|
714
|
+
let startIndex = 0;
|
|
715
|
+
while (startIndex < messages.length && messages[startIndex].role === "system") {
|
|
716
|
+
const segments = segmentsByMessage[startIndex];
|
|
717
|
+
const contentParts = [];
|
|
718
|
+
for (const segment of segments) {
|
|
719
|
+
const formatted = formatSegment(segment);
|
|
720
|
+
if (formatted) {
|
|
721
|
+
contentParts.push(formatted);
|
|
550
722
|
}
|
|
551
|
-
continue;
|
|
552
723
|
}
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
questionParts.push(genericValue);
|
|
724
|
+
if (contentParts.length > 0) {
|
|
725
|
+
systemSegments.push(contentParts.join("\n"));
|
|
556
726
|
}
|
|
727
|
+
startIndex += 1;
|
|
557
728
|
}
|
|
558
|
-
|
|
559
|
-
|
|
729
|
+
const chatPrompt = [];
|
|
730
|
+
if (systemSegments.length > 0) {
|
|
731
|
+
chatPrompt.push({
|
|
732
|
+
role: "system",
|
|
733
|
+
content: systemSegments.join("\n\n")
|
|
734
|
+
});
|
|
560
735
|
}
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
736
|
+
for (let i = startIndex; i < messages.length; i++) {
|
|
737
|
+
const message = messages[i];
|
|
738
|
+
const segments = segmentsByMessage[i];
|
|
739
|
+
const contentParts = [];
|
|
740
|
+
let role = message.role;
|
|
741
|
+
let name;
|
|
742
|
+
if (role === "system") {
|
|
743
|
+
role = "assistant";
|
|
744
|
+
contentParts.push("@[System]:");
|
|
745
|
+
} else if (role === "tool") {
|
|
746
|
+
role = "function";
|
|
747
|
+
name = "tool";
|
|
748
|
+
}
|
|
749
|
+
for (const segment of segments) {
|
|
750
|
+
if (segment.type === "guideline_ref") {
|
|
751
|
+
continue;
|
|
752
|
+
}
|
|
753
|
+
const formatted = formatSegment(segment);
|
|
754
|
+
if (formatted) {
|
|
755
|
+
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
756
|
+
if (isGuidelineRef) {
|
|
757
|
+
continue;
|
|
758
|
+
}
|
|
759
|
+
contentParts.push(formatted);
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
if (contentParts.length === 0) {
|
|
763
|
+
continue;
|
|
764
|
+
}
|
|
765
|
+
chatPrompt.push({
|
|
766
|
+
role,
|
|
767
|
+
content: contentParts.join("\n"),
|
|
768
|
+
...name ? { name } : {}
|
|
769
|
+
});
|
|
770
|
+
}
|
|
771
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
564
772
|
}
|
|
565
773
|
async function fileExists2(absolutePath) {
|
|
566
774
|
try {
|
|
@@ -658,9 +866,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
658
866
|
}
|
|
659
867
|
return parts.join(" ");
|
|
660
868
|
}
|
|
661
|
-
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
869
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
662
870
|
const execution = rawEvalCase.execution;
|
|
663
|
-
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
|
|
871
|
+
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
664
872
|
if (candidateEvaluators === void 0) {
|
|
665
873
|
return void 0;
|
|
666
874
|
}
|
|
@@ -698,6 +906,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
698
906
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
699
907
|
);
|
|
700
908
|
}
|
|
909
|
+
} else {
|
|
910
|
+
resolvedCwd = searchRoots[0];
|
|
701
911
|
}
|
|
702
912
|
evaluators.push({
|
|
703
913
|
name,
|
|
@@ -726,8 +936,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
726
936
|
name,
|
|
727
937
|
type: "llm_judge",
|
|
728
938
|
prompt,
|
|
729
|
-
promptPath
|
|
730
|
-
model
|
|
939
|
+
promptPath
|
|
731
940
|
});
|
|
732
941
|
}
|
|
733
942
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -757,21 +966,14 @@ var import_ax = require("@ax-llm/ax");
|
|
|
757
966
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
758
967
|
function buildChatPrompt(request) {
|
|
759
968
|
if (request.chatPrompt) {
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
} else {
|
|
767
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
768
|
-
}
|
|
769
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
770
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
771
|
-
|
|
772
|
-
${request.guidelines.trim()}`);
|
|
969
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
970
|
+
if (hasSystemMessage) {
|
|
971
|
+
return request.chatPrompt;
|
|
972
|
+
}
|
|
973
|
+
const systemContent2 = resolveSystemContent(request);
|
|
974
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
773
975
|
}
|
|
774
|
-
const systemContent =
|
|
976
|
+
const systemContent = resolveSystemContent(request);
|
|
775
977
|
const userContent = request.question.trim();
|
|
776
978
|
const prompt = [
|
|
777
979
|
{
|
|
@@ -785,6 +987,21 @@ ${request.guidelines.trim()}`);
|
|
|
785
987
|
];
|
|
786
988
|
return prompt;
|
|
787
989
|
}
|
|
990
|
+
function resolveSystemContent(request) {
|
|
991
|
+
const systemSegments = [];
|
|
992
|
+
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
993
|
+
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
994
|
+
systemSegments.push(metadataSystemPrompt.trim());
|
|
995
|
+
} else {
|
|
996
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
997
|
+
}
|
|
998
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
999
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
1000
|
+
|
|
1001
|
+
${request.guidelines.trim()}`);
|
|
1002
|
+
}
|
|
1003
|
+
return systemSegments.join("\n\n");
|
|
1004
|
+
}
|
|
788
1005
|
function extractModelConfig(request, defaults) {
|
|
789
1006
|
const temperature = request.temperature ?? defaults.temperature;
|
|
790
1007
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -3020,24 +3237,23 @@ var LlmJudgeEvaluator = class {
|
|
|
3020
3237
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
3021
3238
|
}
|
|
3022
3239
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
3023
|
-
|
|
3024
|
-
|
|
3240
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
3241
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3242
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
|
|
3243
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
3025
3244
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
3026
3245
|
const variables = {
|
|
3027
3246
|
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3028
3247
|
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
3029
3248
|
candidate_answer: context.candidate,
|
|
3030
|
-
reference_answer: context.evalCase.reference_answer,
|
|
3249
|
+
reference_answer: context.evalCase.reference_answer ?? "",
|
|
3031
3250
|
expected_outcome: context.evalCase.expected_outcome,
|
|
3032
|
-
question:
|
|
3251
|
+
question: formattedQuestion
|
|
3033
3252
|
};
|
|
3034
3253
|
prompt = substituteVariables(systemPrompt, variables);
|
|
3035
|
-
systemPrompt =
|
|
3254
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
3036
3255
|
}
|
|
3037
|
-
const metadata = {
|
|
3038
|
-
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
3039
|
-
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
3040
|
-
};
|
|
3256
|
+
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
3041
3257
|
const response = await judgeProvider.invoke({
|
|
3042
3258
|
question: prompt,
|
|
3043
3259
|
metadata,
|
|
@@ -3057,8 +3273,7 @@ var LlmJudgeEvaluator = class {
|
|
|
3057
3273
|
provider: judgeProvider.id,
|
|
3058
3274
|
prompt,
|
|
3059
3275
|
target: context.target.name,
|
|
3060
|
-
...systemPrompt !== void 0
|
|
3061
|
-
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
3276
|
+
...systemPrompt !== void 0 && { systemPrompt }
|
|
3062
3277
|
};
|
|
3063
3278
|
return {
|
|
3064
3279
|
score,
|
|
@@ -3070,38 +3285,51 @@ var LlmJudgeEvaluator = class {
|
|
|
3070
3285
|
};
|
|
3071
3286
|
}
|
|
3072
3287
|
};
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
3089
|
-
|
|
3288
|
+
function buildSystemPrompt(hasReferenceAnswer) {
|
|
3289
|
+
const basePrompt = [
|
|
3290
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
3291
|
+
""
|
|
3292
|
+
];
|
|
3293
|
+
if (hasReferenceAnswer) {
|
|
3294
|
+
basePrompt.push(
|
|
3295
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
3296
|
+
""
|
|
3297
|
+
);
|
|
3298
|
+
}
|
|
3299
|
+
basePrompt.push(
|
|
3300
|
+
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
3301
|
+
"",
|
|
3302
|
+
"You must respond with a single JSON object matching this schema:",
|
|
3303
|
+
"",
|
|
3304
|
+
"{",
|
|
3305
|
+
' "score": <number between 0.0 and 1.0>,',
|
|
3306
|
+
' "hits": [<array of strings, max 4 items, brief specific achievements>],',
|
|
3307
|
+
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
3308
|
+
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
3309
|
+
"}"
|
|
3310
|
+
);
|
|
3311
|
+
return basePrompt.join("\n");
|
|
3312
|
+
}
|
|
3313
|
+
function buildQualityPrompt(evalCase, candidate, question) {
|
|
3090
3314
|
const parts = [
|
|
3091
3315
|
"[[ ## expected_outcome ## ]]",
|
|
3092
3316
|
evalCase.expected_outcome.trim(),
|
|
3093
3317
|
"",
|
|
3094
3318
|
"[[ ## question ## ]]",
|
|
3095
|
-
|
|
3096
|
-
""
|
|
3097
|
-
"[[ ## reference_answer ## ]]",
|
|
3098
|
-
evalCase.reference_answer.trim(),
|
|
3099
|
-
"",
|
|
3100
|
-
"[[ ## candidate_answer ## ]]",
|
|
3101
|
-
candidate.trim(),
|
|
3102
|
-
"",
|
|
3103
|
-
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
3319
|
+
question.trim(),
|
|
3320
|
+
""
|
|
3104
3321
|
];
|
|
3322
|
+
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
3323
|
+
parts.push(
|
|
3324
|
+
"[[ ## reference_answer ## ]]",
|
|
3325
|
+
evalCase.reference_answer.trim(),
|
|
3326
|
+
""
|
|
3327
|
+
);
|
|
3328
|
+
}
|
|
3329
|
+
parts.push(
|
|
3330
|
+
"[[ ## candidate_answer ## ]]",
|
|
3331
|
+
candidate.trim()
|
|
3332
|
+
);
|
|
3105
3333
|
return parts.join("\n");
|
|
3106
3334
|
}
|
|
3107
3335
|
function clampScore(value) {
|
|
@@ -3184,6 +3412,9 @@ function extractJsonBlob(text) {
|
|
|
3184
3412
|
function isNonEmptyString(value) {
|
|
3185
3413
|
return typeof value === "string" && value.trim().length > 0;
|
|
3186
3414
|
}
|
|
3415
|
+
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
3416
|
+
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
3417
|
+
}
|
|
3187
3418
|
var CodeEvaluator = class {
|
|
3188
3419
|
kind = "code";
|
|
3189
3420
|
script;
|
|
@@ -3842,11 +4073,27 @@ async function evaluateCandidate(options) {
|
|
|
3842
4073
|
agentTimeoutMs
|
|
3843
4074
|
});
|
|
3844
4075
|
const completedAt = nowFn();
|
|
3845
|
-
|
|
3846
|
-
|
|
3847
|
-
|
|
3848
|
-
|
|
3849
|
-
|
|
4076
|
+
let agentProviderRequest;
|
|
4077
|
+
let lmProviderRequest;
|
|
4078
|
+
if (isAgentProvider(provider)) {
|
|
4079
|
+
agentProviderRequest = {
|
|
4080
|
+
question: promptInputs.question,
|
|
4081
|
+
guideline_paths: evalCase.guideline_paths
|
|
4082
|
+
};
|
|
4083
|
+
} else {
|
|
4084
|
+
if (promptInputs.chatPrompt) {
|
|
4085
|
+
lmProviderRequest = {
|
|
4086
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
4087
|
+
guideline_paths: evalCase.guideline_paths
|
|
4088
|
+
};
|
|
4089
|
+
} else {
|
|
4090
|
+
lmProviderRequest = {
|
|
4091
|
+
question: promptInputs.question,
|
|
4092
|
+
guidelines: promptInputs.guidelines,
|
|
4093
|
+
guideline_paths: evalCase.guideline_paths
|
|
4094
|
+
};
|
|
4095
|
+
}
|
|
4096
|
+
}
|
|
3850
4097
|
return {
|
|
3851
4098
|
eval_id: evalCase.id,
|
|
3852
4099
|
dataset: evalCase.dataset,
|
|
@@ -3860,7 +4107,8 @@ async function evaluateCandidate(options) {
|
|
|
3860
4107
|
timestamp: completedAt.toISOString(),
|
|
3861
4108
|
reasoning: score.reasoning,
|
|
3862
4109
|
raw_aspects: score.rawAspects,
|
|
3863
|
-
|
|
4110
|
+
agent_provider_request: agentProviderRequest,
|
|
4111
|
+
lm_provider_request: lmProviderRequest,
|
|
3864
4112
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3865
4113
|
evaluator_results: evaluatorResults
|
|
3866
4114
|
};
|
|
@@ -4019,8 +4267,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
4019
4267
|
now,
|
|
4020
4268
|
judgeProvider,
|
|
4021
4269
|
systemPrompt: customPrompt,
|
|
4022
|
-
evaluator: config
|
|
4023
|
-
judgeModel: config.model
|
|
4270
|
+
evaluator: config
|
|
4024
4271
|
});
|
|
4025
4272
|
}
|
|
4026
4273
|
async function resolveCustomPrompt(config) {
|
|
@@ -4089,6 +4336,7 @@ async function invokeProvider(provider, options) {
|
|
|
4089
4336
|
question: promptInputs.question,
|
|
4090
4337
|
guidelines: promptInputs.guidelines,
|
|
4091
4338
|
guideline_patterns: evalCase.guideline_patterns,
|
|
4339
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
4092
4340
|
inputFiles: evalCase.file_paths,
|
|
4093
4341
|
evalCaseId: evalCase.id,
|
|
4094
4342
|
attempt,
|
|
@@ -4105,12 +4353,30 @@ async function invokeProvider(provider, options) {
|
|
|
4105
4353
|
}
|
|
4106
4354
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
4107
4355
|
const message = error instanceof Error ? error.message : String(error);
|
|
4108
|
-
|
|
4109
|
-
|
|
4110
|
-
|
|
4111
|
-
|
|
4112
|
-
|
|
4113
|
-
|
|
4356
|
+
let agentProviderRequest;
|
|
4357
|
+
let lmProviderRequest;
|
|
4358
|
+
if (isAgentProvider(provider)) {
|
|
4359
|
+
agentProviderRequest = {
|
|
4360
|
+
question: promptInputs.question,
|
|
4361
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4362
|
+
error: message
|
|
4363
|
+
};
|
|
4364
|
+
} else {
|
|
4365
|
+
if (promptInputs.chatPrompt) {
|
|
4366
|
+
lmProviderRequest = {
|
|
4367
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
4368
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4369
|
+
error: message
|
|
4370
|
+
};
|
|
4371
|
+
} else {
|
|
4372
|
+
lmProviderRequest = {
|
|
4373
|
+
question: promptInputs.question,
|
|
4374
|
+
guidelines: promptInputs.guidelines,
|
|
4375
|
+
guideline_paths: evalCase.guideline_paths,
|
|
4376
|
+
error: message
|
|
4377
|
+
};
|
|
4378
|
+
}
|
|
4379
|
+
}
|
|
4114
4380
|
return {
|
|
4115
4381
|
eval_id: evalCase.id,
|
|
4116
4382
|
dataset: evalCase.dataset,
|
|
@@ -4123,7 +4389,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4123
4389
|
target: targetName,
|
|
4124
4390
|
timestamp: timestamp.toISOString(),
|
|
4125
4391
|
raw_aspects: [],
|
|
4126
|
-
|
|
4392
|
+
agent_provider_request: agentProviderRequest,
|
|
4393
|
+
lm_provider_request: lmProviderRequest,
|
|
4127
4394
|
error: message
|
|
4128
4395
|
};
|
|
4129
4396
|
}
|
|
@@ -4135,6 +4402,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
4135
4402
|
hash.update(promptInputs.question);
|
|
4136
4403
|
hash.update(promptInputs.guidelines);
|
|
4137
4404
|
hash.update(promptInputs.systemMessage ?? "");
|
|
4405
|
+
if (promptInputs.chatPrompt) {
|
|
4406
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
4407
|
+
}
|
|
4138
4408
|
return hash.digest("hex");
|
|
4139
4409
|
}
|
|
4140
4410
|
function isTimeoutLike(error) {
|
|
@@ -4183,6 +4453,7 @@ function createAgentKernel() {
|
|
|
4183
4453
|
loadEvalCases,
|
|
4184
4454
|
normalizeLineEndings,
|
|
4185
4455
|
readTargetDefinitions,
|
|
4456
|
+
readTestSuiteMetadata,
|
|
4186
4457
|
readTextFile,
|
|
4187
4458
|
resolveAndCreateProvider,
|
|
4188
4459
|
resolveFileReference,
|