@agentv/core 0.14.2 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +168 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -5
- package/dist/index.d.ts +18 -5
- package/dist/index.js +151 -55
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -116,7 +116,7 @@ function getHitCount(result) {
|
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
// src/evaluation/yaml-parser.ts
|
|
119
|
-
var
|
|
119
|
+
var import_promises6 = require("fs/promises");
|
|
120
120
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
121
121
|
var import_yaml2 = require("yaml");
|
|
122
122
|
|
|
@@ -154,7 +154,7 @@ ${part.content}
|
|
|
154
154
|
}
|
|
155
155
|
return parts.map((p) => p.content).join(" ");
|
|
156
156
|
}
|
|
157
|
-
function formatSegment(segment) {
|
|
157
|
+
function formatSegment(segment, mode = "lm") {
|
|
158
158
|
const type = asString(segment.type);
|
|
159
159
|
if (type === "text") {
|
|
160
160
|
return asString(segment.value);
|
|
@@ -164,8 +164,14 @@ function formatSegment(segment) {
|
|
|
164
164
|
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
165
165
|
}
|
|
166
166
|
if (type === "file") {
|
|
167
|
-
const text = asString(segment.text);
|
|
168
167
|
const filePath = asString(segment.path);
|
|
168
|
+
if (!filePath) {
|
|
169
|
+
return void 0;
|
|
170
|
+
}
|
|
171
|
+
if (mode === "agent") {
|
|
172
|
+
return `<file: path="${filePath}">`;
|
|
173
|
+
}
|
|
174
|
+
const text = asString(segment.text);
|
|
169
175
|
if (text && filePath) {
|
|
170
176
|
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
171
177
|
}
|
|
@@ -369,8 +375,67 @@ function logWarning(message) {
|
|
|
369
375
|
|
|
370
376
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
371
377
|
var import_node_path3 = __toESM(require("path"), 1);
|
|
378
|
+
|
|
379
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
380
|
+
var import_promises3 = require("fs/promises");
|
|
381
|
+
|
|
382
|
+
// src/evaluation/template-variables.ts
|
|
383
|
+
var TEMPLATE_VARIABLES = {
|
|
384
|
+
CANDIDATE_ANSWER: "candidate_answer",
|
|
385
|
+
EXPECTED_MESSAGES: "expected_messages",
|
|
386
|
+
QUESTION: "question",
|
|
387
|
+
EXPECTED_OUTCOME: "expected_outcome",
|
|
388
|
+
REFERENCE_ANSWER: "reference_answer",
|
|
389
|
+
INPUT_MESSAGES: "input_messages"
|
|
390
|
+
};
|
|
391
|
+
var VALID_TEMPLATE_VARIABLES = new Set(
|
|
392
|
+
Object.values(TEMPLATE_VARIABLES)
|
|
393
|
+
);
|
|
394
|
+
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
395
|
+
TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
|
|
396
|
+
TEMPLATE_VARIABLES.EXPECTED_MESSAGES
|
|
397
|
+
]);
|
|
398
|
+
|
|
399
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
372
400
|
var ANSI_YELLOW2 = "\x1B[33m";
|
|
373
401
|
var ANSI_RESET2 = "\x1B[0m";
|
|
402
|
+
async function validateCustomPromptContent(promptPath) {
|
|
403
|
+
const content = await (0, import_promises3.readFile)(promptPath, "utf8");
|
|
404
|
+
validateTemplateVariables(content, promptPath);
|
|
405
|
+
}
|
|
406
|
+
function validateTemplateVariables(content, source) {
|
|
407
|
+
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
|
|
408
|
+
const foundVariables = /* @__PURE__ */ new Set();
|
|
409
|
+
const invalidVariables = [];
|
|
410
|
+
let match;
|
|
411
|
+
while ((match = variablePattern.exec(content)) !== null) {
|
|
412
|
+
const varName = match[1];
|
|
413
|
+
foundVariables.add(varName);
|
|
414
|
+
if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
|
|
415
|
+
invalidVariables.push(varName);
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
|
|
419
|
+
const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
|
|
420
|
+
const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
|
|
421
|
+
if (!hasRequiredFields) {
|
|
422
|
+
throw new Error(
|
|
423
|
+
`Missing required fields. Must include at least one of:
|
|
424
|
+
- {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
|
|
425
|
+
- {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
|
|
426
|
+
);
|
|
427
|
+
}
|
|
428
|
+
if (invalidVariables.length > 0) {
|
|
429
|
+
const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
|
|
430
|
+
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
431
|
+
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
|
|
432
|
+
console.warn(warningMessage);
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
437
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
438
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
374
439
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
375
440
|
const execution = rawEvalCase.execution;
|
|
376
441
|
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
@@ -429,6 +494,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
429
494
|
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
430
495
|
if (resolved.resolvedPath) {
|
|
431
496
|
promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
497
|
+
try {
|
|
498
|
+
await validateCustomPromptContent(promptPath);
|
|
499
|
+
} catch (error) {
|
|
500
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
501
|
+
throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
|
|
502
|
+
}
|
|
432
503
|
} else {
|
|
433
504
|
logWarning2(
|
|
434
505
|
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
@@ -465,18 +536,18 @@ function isJsonObject2(value) {
|
|
|
465
536
|
function logWarning2(message, details) {
|
|
466
537
|
if (details && details.length > 0) {
|
|
467
538
|
const detailBlock = details.join("\n");
|
|
468
|
-
console.warn(`${
|
|
469
|
-
${detailBlock}${
|
|
539
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
540
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
470
541
|
} else {
|
|
471
|
-
console.warn(`${
|
|
542
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
472
543
|
}
|
|
473
544
|
}
|
|
474
545
|
|
|
475
546
|
// src/evaluation/loaders/message-processor.ts
|
|
476
|
-
var
|
|
547
|
+
var import_promises4 = require("fs/promises");
|
|
477
548
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
478
|
-
var
|
|
479
|
-
var
|
|
549
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
550
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
480
551
|
async function processMessages(options) {
|
|
481
552
|
const {
|
|
482
553
|
messages,
|
|
@@ -519,7 +590,7 @@ async function processMessages(options) {
|
|
|
519
590
|
continue;
|
|
520
591
|
}
|
|
521
592
|
try {
|
|
522
|
-
const fileContent = (await (0,
|
|
593
|
+
const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
523
594
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
524
595
|
const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
|
|
525
596
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
@@ -590,7 +661,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
590
661
|
continue;
|
|
591
662
|
}
|
|
592
663
|
try {
|
|
593
|
-
const fileContent = (await (0,
|
|
664
|
+
const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
594
665
|
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
595
666
|
if (verbose) {
|
|
596
667
|
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
@@ -640,19 +711,19 @@ function cloneJsonValue(value) {
|
|
|
640
711
|
function logWarning3(message, details) {
|
|
641
712
|
if (details && details.length > 0) {
|
|
642
713
|
const detailBlock = details.join("\n");
|
|
643
|
-
console.warn(`${
|
|
644
|
-
${detailBlock}${
|
|
714
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}
|
|
715
|
+
${detailBlock}${ANSI_RESET4}`);
|
|
645
716
|
} else {
|
|
646
|
-
console.warn(`${
|
|
717
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
647
718
|
}
|
|
648
719
|
}
|
|
649
720
|
|
|
650
721
|
// src/evaluation/formatting/prompt-builder.ts
|
|
651
|
-
var
|
|
722
|
+
var import_promises5 = require("fs/promises");
|
|
652
723
|
var import_node_path5 = __toESM(require("path"), 1);
|
|
653
|
-
var
|
|
654
|
-
var
|
|
655
|
-
async function buildPromptInputs(testCase) {
|
|
724
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
725
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
726
|
+
async function buildPromptInputs(testCase, mode = "lm") {
|
|
656
727
|
const guidelineParts = [];
|
|
657
728
|
for (const rawPath of testCase.guideline_paths) {
|
|
658
729
|
const absolutePath = import_node_path5.default.resolve(rawPath);
|
|
@@ -661,7 +732,7 @@ async function buildPromptInputs(testCase) {
|
|
|
661
732
|
continue;
|
|
662
733
|
}
|
|
663
734
|
try {
|
|
664
|
-
const content = (await (0,
|
|
735
|
+
const content = (await (0, import_promises5.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
665
736
|
guidelineParts.push({
|
|
666
737
|
content,
|
|
667
738
|
isFile: true,
|
|
@@ -728,7 +799,7 @@ async function buildPromptInputs(testCase) {
|
|
|
728
799
|
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
729
800
|
const contentParts = [];
|
|
730
801
|
for (const segment of segments) {
|
|
731
|
-
const formattedContent = formatSegment(segment);
|
|
802
|
+
const formattedContent = formatSegment(segment, mode);
|
|
732
803
|
if (formattedContent) {
|
|
733
804
|
contentParts.push(formattedContent);
|
|
734
805
|
}
|
|
@@ -743,7 +814,11 @@ ${messageContent}`);
|
|
|
743
814
|
} else {
|
|
744
815
|
const questionParts = [];
|
|
745
816
|
for (const segment of testCase.input_segments) {
|
|
746
|
-
|
|
817
|
+
if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
|
|
818
|
+
questionParts.push(`<Attached: ${segment.path}>`);
|
|
819
|
+
continue;
|
|
820
|
+
}
|
|
821
|
+
const formattedContent = formatSegment(segment, mode);
|
|
747
822
|
if (formattedContent) {
|
|
748
823
|
questionParts.push(formattedContent);
|
|
749
824
|
}
|
|
@@ -757,7 +832,8 @@ ${messageContent}`);
|
|
|
757
832
|
messages: testCase.input_messages,
|
|
758
833
|
segmentsByMessage,
|
|
759
834
|
guidelinePatterns: testCase.guideline_patterns,
|
|
760
|
-
guidelineContent: guidelines
|
|
835
|
+
guidelineContent: guidelines,
|
|
836
|
+
mode
|
|
761
837
|
}) : void 0;
|
|
762
838
|
return { question, guidelines, chatPrompt };
|
|
763
839
|
}
|
|
@@ -774,7 +850,7 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
|
774
850
|
return messagesWithContent > 1;
|
|
775
851
|
}
|
|
776
852
|
function buildChatPromptFromSegments(options) {
|
|
777
|
-
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
853
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt, mode = "lm" } = options;
|
|
778
854
|
if (messages.length === 0) {
|
|
779
855
|
return void 0;
|
|
780
856
|
}
|
|
@@ -792,7 +868,7 @@ ${guidelineContent.trim()}`);
|
|
|
792
868
|
const segments = segmentsByMessage[startIndex];
|
|
793
869
|
const contentParts = [];
|
|
794
870
|
for (const segment of segments) {
|
|
795
|
-
const formatted = formatSegment(segment);
|
|
871
|
+
const formatted = formatSegment(segment, mode);
|
|
796
872
|
if (formatted) {
|
|
797
873
|
contentParts.push(formatted);
|
|
798
874
|
}
|
|
@@ -825,7 +901,7 @@ ${guidelineContent.trim()}`);
|
|
|
825
901
|
if (segment.type === "guideline_ref") {
|
|
826
902
|
continue;
|
|
827
903
|
}
|
|
828
|
-
const formatted = formatSegment(segment);
|
|
904
|
+
const formatted = formatSegment(segment, mode);
|
|
829
905
|
if (formatted) {
|
|
830
906
|
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
831
907
|
if (isGuidelineRef) {
|
|
@@ -849,17 +925,18 @@ function asString4(value) {
|
|
|
849
925
|
return typeof value === "string" ? value : void 0;
|
|
850
926
|
}
|
|
851
927
|
function logWarning4(message) {
|
|
852
|
-
console.warn(`${
|
|
928
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
853
929
|
}
|
|
854
930
|
|
|
855
931
|
// src/evaluation/yaml-parser.ts
|
|
856
|
-
var
|
|
857
|
-
var
|
|
932
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
933
|
+
var ANSI_RED = "\x1B[31m";
|
|
934
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
858
935
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
859
936
|
async function readTestSuiteMetadata(testFilePath) {
|
|
860
937
|
try {
|
|
861
938
|
const absolutePath = import_node_path6.default.resolve(testFilePath);
|
|
862
|
-
const content = await (0,
|
|
939
|
+
const content = await (0, import_promises6.readFile)(absolutePath, "utf8");
|
|
863
940
|
const parsed = (0, import_yaml2.parse)(content);
|
|
864
941
|
if (!isJsonObject(parsed)) {
|
|
865
942
|
return {};
|
|
@@ -877,7 +954,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
877
954
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
878
955
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
879
956
|
const guidelinePatterns = config?.guideline_patterns;
|
|
880
|
-
const rawFile = await (0,
|
|
957
|
+
const rawFile = await (0, import_promises6.readFile)(absoluteTestPath, "utf8");
|
|
881
958
|
const parsed = (0, import_yaml2.parse)(rawFile);
|
|
882
959
|
if (!isJsonObject(parsed)) {
|
|
883
960
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
@@ -915,14 +992,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
915
992
|
const inputMessagesValue = evalcase.input_messages;
|
|
916
993
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
917
994
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
918
|
-
|
|
995
|
+
logError(`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`);
|
|
919
996
|
continue;
|
|
920
997
|
}
|
|
921
998
|
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
922
999
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
923
1000
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
924
1001
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
925
|
-
|
|
1002
|
+
logError(`No valid expected message found for eval case: ${id}`);
|
|
926
1003
|
continue;
|
|
927
1004
|
}
|
|
928
1005
|
if (expectedMessages.length > 1) {
|
|
@@ -953,7 +1030,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
953
1030
|
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
954
1031
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
955
1032
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
956
|
-
|
|
1033
|
+
let evaluators;
|
|
1034
|
+
try {
|
|
1035
|
+
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
1036
|
+
} catch (error) {
|
|
1037
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1038
|
+
logError(`Skipping eval case '${id}': ${message}`);
|
|
1039
|
+
continue;
|
|
1040
|
+
}
|
|
957
1041
|
const userFilePaths = [];
|
|
958
1042
|
for (const segment of inputSegments) {
|
|
959
1043
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -971,7 +1055,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
971
1055
|
question,
|
|
972
1056
|
input_messages: inputMessages,
|
|
973
1057
|
input_segments: inputSegments,
|
|
974
|
-
|
|
1058
|
+
expected_segments: outputSegments,
|
|
975
1059
|
reference_answer: referenceAnswer,
|
|
976
1060
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
977
1061
|
guideline_patterns: guidelinePatterns,
|
|
@@ -1003,20 +1087,29 @@ function asString5(value) {
|
|
|
1003
1087
|
function logWarning5(message, details) {
|
|
1004
1088
|
if (details && details.length > 0) {
|
|
1005
1089
|
const detailBlock = details.join("\n");
|
|
1006
|
-
console.warn(`${
|
|
1007
|
-
${detailBlock}${
|
|
1090
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}
|
|
1091
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
1008
1092
|
} else {
|
|
1009
|
-
console.warn(`${
|
|
1093
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
function logError(message, details) {
|
|
1097
|
+
if (details && details.length > 0) {
|
|
1098
|
+
const detailBlock = details.join("\n");
|
|
1099
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1100
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
1101
|
+
} else {
|
|
1102
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
|
|
1010
1103
|
}
|
|
1011
1104
|
}
|
|
1012
1105
|
|
|
1013
1106
|
// src/evaluation/file-utils.ts
|
|
1014
1107
|
var import_node_fs2 = require("fs");
|
|
1015
|
-
var
|
|
1108
|
+
var import_promises7 = require("fs/promises");
|
|
1016
1109
|
var import_node_path7 = __toESM(require("path"), 1);
|
|
1017
1110
|
async function fileExists2(filePath) {
|
|
1018
1111
|
try {
|
|
1019
|
-
await (0,
|
|
1112
|
+
await (0, import_promises7.access)(filePath, import_node_fs2.constants.F_OK);
|
|
1020
1113
|
return true;
|
|
1021
1114
|
} catch {
|
|
1022
1115
|
return false;
|
|
@@ -1026,7 +1119,7 @@ function normalizeLineEndings(content) {
|
|
|
1026
1119
|
return content.replace(/\r\n/g, "\n");
|
|
1027
1120
|
}
|
|
1028
1121
|
async function readTextFile(filePath) {
|
|
1029
|
-
const content = await (0,
|
|
1122
|
+
const content = await (0, import_promises7.readFile)(filePath, "utf8");
|
|
1030
1123
|
return normalizeLineEndings(content);
|
|
1031
1124
|
}
|
|
1032
1125
|
async function findGitRoot(startPath) {
|
|
@@ -1447,7 +1540,7 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
1447
1540
|
|
|
1448
1541
|
// src/evaluation/providers/cli.ts
|
|
1449
1542
|
var import_node_child_process = require("child_process");
|
|
1450
|
-
var
|
|
1543
|
+
var import_promises8 = __toESM(require("fs/promises"), 1);
|
|
1451
1544
|
var import_node_os = __toESM(require("os"), 1);
|
|
1452
1545
|
var import_node_path8 = __toESM(require("path"), 1);
|
|
1453
1546
|
var import_node_util = require("util");
|
|
@@ -1548,7 +1641,7 @@ var CliProvider = class {
|
|
|
1548
1641
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1549
1642
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1550
1643
|
} finally {
|
|
1551
|
-
await
|
|
1644
|
+
await import_promises8.default.unlink(filePath).catch(() => {
|
|
1552
1645
|
});
|
|
1553
1646
|
}
|
|
1554
1647
|
}
|
|
@@ -1687,7 +1780,7 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1687
1780
|
var import_node_child_process2 = require("child_process");
|
|
1688
1781
|
var import_node_crypto = require("crypto");
|
|
1689
1782
|
var import_node_fs3 = require("fs");
|
|
1690
|
-
var
|
|
1783
|
+
var import_promises9 = require("fs/promises");
|
|
1691
1784
|
var import_node_os2 = require("os");
|
|
1692
1785
|
var import_node_path10 = __toESM(require("path"), 1);
|
|
1693
1786
|
var import_node_util2 = require("util");
|
|
@@ -1877,7 +1970,7 @@ var CodexProvider = class {
|
|
|
1877
1970
|
try {
|
|
1878
1971
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1879
1972
|
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
1880
|
-
await (0,
|
|
1973
|
+
await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
|
|
1881
1974
|
const args = this.buildCodexArgs();
|
|
1882
1975
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
1883
1976
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -1960,11 +2053,11 @@ var CodexProvider = class {
|
|
|
1960
2053
|
}
|
|
1961
2054
|
}
|
|
1962
2055
|
async createWorkspace() {
|
|
1963
|
-
return await (0,
|
|
2056
|
+
return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
|
|
1964
2057
|
}
|
|
1965
2058
|
async cleanupWorkspace(workspaceRoot) {
|
|
1966
2059
|
try {
|
|
1967
|
-
await (0,
|
|
2060
|
+
await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
|
|
1968
2061
|
} catch {
|
|
1969
2062
|
}
|
|
1970
2063
|
}
|
|
@@ -1984,7 +2077,7 @@ var CodexProvider = class {
|
|
|
1984
2077
|
return void 0;
|
|
1985
2078
|
}
|
|
1986
2079
|
try {
|
|
1987
|
-
await (0,
|
|
2080
|
+
await (0, import_promises9.mkdir)(logDir, { recursive: true });
|
|
1988
2081
|
} catch (error) {
|
|
1989
2082
|
const message = error instanceof Error ? error.message : String(error);
|
|
1990
2083
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -2207,7 +2300,7 @@ async function locateExecutable(candidate) {
|
|
|
2207
2300
|
if (includesPathSeparator) {
|
|
2208
2301
|
const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
|
|
2209
2302
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2210
|
-
await (0,
|
|
2303
|
+
await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2211
2304
|
return executablePath;
|
|
2212
2305
|
}
|
|
2213
2306
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -2217,7 +2310,7 @@ async function locateExecutable(candidate) {
|
|
|
2217
2310
|
const preferred = selectExecutableCandidate(lines);
|
|
2218
2311
|
if (preferred) {
|
|
2219
2312
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
2220
|
-
await (0,
|
|
2313
|
+
await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2221
2314
|
return executablePath;
|
|
2222
2315
|
}
|
|
2223
2316
|
} catch {
|
|
@@ -2251,7 +2344,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
2251
2344
|
for (const ext of extensions) {
|
|
2252
2345
|
const withExtension = `${candidate}${ext}`;
|
|
2253
2346
|
try {
|
|
2254
|
-
await (0,
|
|
2347
|
+
await (0, import_promises9.access)(withExtension, import_node_fs3.constants.F_OK);
|
|
2255
2348
|
return withExtension;
|
|
2256
2349
|
} catch {
|
|
2257
2350
|
}
|
|
@@ -3313,7 +3406,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3313
3406
|
|
|
3314
3407
|
// src/evaluation/providers/targets-file.ts
|
|
3315
3408
|
var import_node_fs4 = require("fs");
|
|
3316
|
-
var
|
|
3409
|
+
var import_promises10 = require("fs/promises");
|
|
3317
3410
|
var import_node_path12 = __toESM(require("path"), 1);
|
|
3318
3411
|
var import_yaml3 = require("yaml");
|
|
3319
3412
|
|
|
@@ -3376,7 +3469,7 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
3376
3469
|
}
|
|
3377
3470
|
async function fileExists3(filePath) {
|
|
3378
3471
|
try {
|
|
3379
|
-
await (0,
|
|
3472
|
+
await (0, import_promises10.access)(filePath, import_node_fs4.constants.F_OK);
|
|
3380
3473
|
return true;
|
|
3381
3474
|
} catch {
|
|
3382
3475
|
return false;
|
|
@@ -3387,7 +3480,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
3387
3480
|
if (!await fileExists3(absolutePath)) {
|
|
3388
3481
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3389
3482
|
}
|
|
3390
|
-
const raw = await (0,
|
|
3483
|
+
const raw = await (0, import_promises10.readFile)(absolutePath, "utf8");
|
|
3391
3484
|
const parsed = (0, import_yaml3.parse)(raw);
|
|
3392
3485
|
if (!isRecord(parsed)) {
|
|
3393
3486
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -3438,16 +3531,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
|
|
|
3438
3531
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3439
3532
|
|
|
3440
3533
|
[[ ## expected_outcome ## ]]
|
|
3441
|
-
{{
|
|
3534
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
|
|
3442
3535
|
|
|
3443
3536
|
[[ ## question ## ]]
|
|
3444
|
-
{{
|
|
3537
|
+
{{${TEMPLATE_VARIABLES.QUESTION}}}
|
|
3445
3538
|
|
|
3446
3539
|
[[ ## reference_answer ## ]]
|
|
3447
|
-
{{
|
|
3540
|
+
{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
|
|
3448
3541
|
|
|
3449
3542
|
[[ ## candidate_answer ## ]]
|
|
3450
|
-
{{
|
|
3543
|
+
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
3451
3544
|
var LlmJudgeEvaluator = class {
|
|
3452
3545
|
kind = "llm_judge";
|
|
3453
3546
|
resolveJudgeProvider;
|
|
@@ -3470,12 +3563,12 @@ var LlmJudgeEvaluator = class {
|
|
|
3470
3563
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
3471
3564
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3472
3565
|
const variables = {
|
|
3473
|
-
|
|
3474
|
-
|
|
3475
|
-
|
|
3476
|
-
|
|
3477
|
-
|
|
3478
|
-
|
|
3566
|
+
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3567
|
+
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(context.evalCase.expected_segments, null, 2),
|
|
3568
|
+
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
3569
|
+
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
3570
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
3571
|
+
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
|
|
3479
3572
|
};
|
|
3480
3573
|
const systemPrompt = buildOutputSchema();
|
|
3481
3574
|
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
@@ -3707,14 +3800,14 @@ function parseJsonSafe(payload) {
|
|
|
3707
3800
|
}
|
|
3708
3801
|
}
|
|
3709
3802
|
function substituteVariables(template, variables) {
|
|
3710
|
-
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
3803
|
+
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
3711
3804
|
return variables[varName] ?? match;
|
|
3712
3805
|
});
|
|
3713
3806
|
}
|
|
3714
3807
|
|
|
3715
3808
|
// src/evaluation/orchestrator.ts
|
|
3716
3809
|
var import_node_crypto2 = require("crypto");
|
|
3717
|
-
var
|
|
3810
|
+
var import_promises11 = require("fs/promises");
|
|
3718
3811
|
var import_node_path13 = __toESM(require("path"), 1);
|
|
3719
3812
|
|
|
3720
3813
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
@@ -3871,11 +3964,11 @@ async function runEvaluation(options) {
|
|
|
3871
3964
|
now,
|
|
3872
3965
|
evalId,
|
|
3873
3966
|
verbose,
|
|
3967
|
+
evalCases: preloadedEvalCases,
|
|
3874
3968
|
onResult,
|
|
3875
3969
|
onProgress
|
|
3876
3970
|
} = options;
|
|
3877
|
-
const
|
|
3878
|
-
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3971
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
|
|
3879
3972
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3880
3973
|
if (filteredEvalCases.length === 0) {
|
|
3881
3974
|
if (evalId) {
|
|
@@ -4059,8 +4152,9 @@ async function runBatchEvaluation(options) {
|
|
|
4059
4152
|
agentTimeoutMs
|
|
4060
4153
|
} = options;
|
|
4061
4154
|
const promptInputsList = [];
|
|
4155
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
4062
4156
|
for (const evalCase of evalCases) {
|
|
4063
|
-
const promptInputs = await buildPromptInputs(evalCase);
|
|
4157
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
4064
4158
|
if (promptDumpDir) {
|
|
4065
4159
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
4066
4160
|
}
|
|
@@ -4166,7 +4260,8 @@ async function runEvalCase(options) {
|
|
|
4166
4260
|
signal,
|
|
4167
4261
|
judgeProvider
|
|
4168
4262
|
} = options;
|
|
4169
|
-
const
|
|
4263
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
4264
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
4170
4265
|
if (promptDumpDir) {
|
|
4171
4266
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
4172
4267
|
}
|
|
@@ -4455,7 +4550,8 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
4455
4550
|
async function resolveCustomPrompt(config) {
|
|
4456
4551
|
if (config.promptPath) {
|
|
4457
4552
|
try {
|
|
4458
|
-
|
|
4553
|
+
const content = await readTextFile(config.promptPath);
|
|
4554
|
+
return content;
|
|
4459
4555
|
} catch (error) {
|
|
4460
4556
|
const message = error instanceof Error ? error.message : String(error);
|
|
4461
4557
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -4490,14 +4586,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
4490
4586
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
4491
4587
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
4492
4588
|
const filePath = import_node_path13.default.resolve(directory, filename);
|
|
4493
|
-
await (0,
|
|
4589
|
+
await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
|
|
4494
4590
|
const payload = {
|
|
4495
4591
|
eval_id: evalCase.id,
|
|
4496
4592
|
question: promptInputs.question,
|
|
4497
4593
|
guidelines: promptInputs.guidelines,
|
|
4498
4594
|
guideline_paths: evalCase.guideline_paths
|
|
4499
4595
|
};
|
|
4500
|
-
await (0,
|
|
4596
|
+
await (0, import_promises11.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
4501
4597
|
}
|
|
4502
4598
|
function sanitizeFilename(value) {
|
|
4503
4599
|
if (!value) {
|