@agentv/core 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IOCVST3R.js → chunk-YCIZ33BO.js} +28 -11
- package/dist/chunk-YCIZ33BO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +68 -64
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +64 -67
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +297 -149
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -5
- package/dist/index.d.ts +18 -5
- package/dist/index.js +251 -115
- package/dist/index.js.map +1 -1
- package/package.json +15 -16
- package/LICENSE +0 -21
- package/dist/chunk-IOCVST3R.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -116,7 +116,7 @@ function getHitCount(result) {
|
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
// src/evaluation/yaml-parser.ts
|
|
119
|
-
var
|
|
119
|
+
var import_promises6 = require("fs/promises");
|
|
120
120
|
var import_node_path6 = __toESM(require("path"), 1);
|
|
121
121
|
var import_yaml2 = require("yaml");
|
|
122
122
|
|
|
@@ -125,11 +125,11 @@ function extractCodeBlocks(segments) {
|
|
|
125
125
|
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
126
126
|
const codeBlocks = [];
|
|
127
127
|
for (const segment of segments) {
|
|
128
|
-
const typeValue = segment
|
|
128
|
+
const typeValue = segment.type;
|
|
129
129
|
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
130
130
|
continue;
|
|
131
131
|
}
|
|
132
|
-
const textValue = segment
|
|
132
|
+
const textValue = segment.value;
|
|
133
133
|
if (typeof textValue !== "string") {
|
|
134
134
|
continue;
|
|
135
135
|
}
|
|
@@ -154,7 +154,7 @@ ${part.content}
|
|
|
154
154
|
}
|
|
155
155
|
return parts.map((p) => p.content).join(" ");
|
|
156
156
|
}
|
|
157
|
-
function formatSegment(segment) {
|
|
157
|
+
function formatSegment(segment, mode = "lm") {
|
|
158
158
|
const type = asString(segment.type);
|
|
159
159
|
if (type === "text") {
|
|
160
160
|
return asString(segment.value);
|
|
@@ -164,8 +164,14 @@ function formatSegment(segment) {
|
|
|
164
164
|
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
165
165
|
}
|
|
166
166
|
if (type === "file") {
|
|
167
|
-
const text = asString(segment.text);
|
|
168
167
|
const filePath = asString(segment.path);
|
|
168
|
+
if (!filePath) {
|
|
169
|
+
return void 0;
|
|
170
|
+
}
|
|
171
|
+
if (mode === "agent") {
|
|
172
|
+
return `<file: path="${filePath}">`;
|
|
173
|
+
}
|
|
174
|
+
const text = asString(segment.text);
|
|
169
175
|
if (text && filePath) {
|
|
170
176
|
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
171
177
|
}
|
|
@@ -194,9 +200,9 @@ function asString(value) {
|
|
|
194
200
|
}
|
|
195
201
|
|
|
196
202
|
// src/evaluation/loaders/config-loader.ts
|
|
197
|
-
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
198
203
|
var import_promises2 = require("fs/promises");
|
|
199
204
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
205
|
+
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
200
206
|
var import_yaml = require("yaml");
|
|
201
207
|
|
|
202
208
|
// src/evaluation/loaders/file-resolver.ts
|
|
@@ -338,8 +344,9 @@ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
|
338
344
|
guideline_patterns: guidelinePatterns
|
|
339
345
|
};
|
|
340
346
|
} catch (error) {
|
|
341
|
-
logWarning(
|
|
342
|
-
|
|
347
|
+
logWarning(
|
|
348
|
+
`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`
|
|
349
|
+
);
|
|
343
350
|
}
|
|
344
351
|
}
|
|
345
352
|
return null;
|
|
@@ -369,8 +376,66 @@ function logWarning(message) {
|
|
|
369
376
|
|
|
370
377
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
371
378
|
var import_node_path3 = __toESM(require("path"), 1);
|
|
379
|
+
|
|
380
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
381
|
+
var import_promises3 = require("fs/promises");
|
|
382
|
+
|
|
383
|
+
// src/evaluation/template-variables.ts
|
|
384
|
+
var TEMPLATE_VARIABLES = {
|
|
385
|
+
CANDIDATE_ANSWER: "candidate_answer",
|
|
386
|
+
EXPECTED_MESSAGES: "expected_messages",
|
|
387
|
+
QUESTION: "question",
|
|
388
|
+
EXPECTED_OUTCOME: "expected_outcome",
|
|
389
|
+
REFERENCE_ANSWER: "reference_answer",
|
|
390
|
+
INPUT_MESSAGES: "input_messages"
|
|
391
|
+
};
|
|
392
|
+
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
393
|
+
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
394
|
+
TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
|
|
395
|
+
TEMPLATE_VARIABLES.EXPECTED_MESSAGES
|
|
396
|
+
]);
|
|
397
|
+
|
|
398
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
372
399
|
var ANSI_YELLOW2 = "\x1B[33m";
|
|
373
400
|
var ANSI_RESET2 = "\x1B[0m";
|
|
401
|
+
async function validateCustomPromptContent(promptPath) {
|
|
402
|
+
const content = await (0, import_promises3.readFile)(promptPath, "utf8");
|
|
403
|
+
validateTemplateVariables(content, promptPath);
|
|
404
|
+
}
|
|
405
|
+
function validateTemplateVariables(content, source) {
|
|
406
|
+
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
|
|
407
|
+
const foundVariables = /* @__PURE__ */ new Set();
|
|
408
|
+
const invalidVariables = [];
|
|
409
|
+
let match = variablePattern.exec(content);
|
|
410
|
+
while (match !== null) {
|
|
411
|
+
const varName = match[1];
|
|
412
|
+
foundVariables.add(varName);
|
|
413
|
+
if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
|
|
414
|
+
invalidVariables.push(varName);
|
|
415
|
+
}
|
|
416
|
+
match = variablePattern.exec(content);
|
|
417
|
+
}
|
|
418
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
|
|
419
|
+
const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
|
|
420
|
+
const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
|
|
421
|
+
if (!hasRequiredFields) {
|
|
422
|
+
throw new Error(
|
|
423
|
+
`Missing required fields. Must include at least one of:
|
|
424
|
+
- {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
|
|
425
|
+
- {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
|
|
426
|
+
);
|
|
427
|
+
}
|
|
428
|
+
if (invalidVariables.length > 0) {
|
|
429
|
+
const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
|
|
430
|
+
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
431
|
+
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
|
|
432
|
+
console.warn(warningMessage);
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
437
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
438
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
374
439
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
375
440
|
const execution = rawEvalCase.execution;
|
|
376
441
|
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
@@ -429,6 +494,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
429
494
|
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
430
495
|
if (resolved.resolvedPath) {
|
|
431
496
|
promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
497
|
+
try {
|
|
498
|
+
await validateCustomPromptContent(promptPath);
|
|
499
|
+
} catch (error) {
|
|
500
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
501
|
+
throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
|
|
502
|
+
}
|
|
432
503
|
} else {
|
|
433
504
|
logWarning2(
|
|
434
505
|
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
@@ -465,18 +536,18 @@ function isJsonObject2(value) {
|
|
|
465
536
|
function logWarning2(message, details) {
|
|
466
537
|
if (details && details.length > 0) {
|
|
467
538
|
const detailBlock = details.join("\n");
|
|
468
|
-
console.warn(`${
|
|
469
|
-
${detailBlock}${
|
|
539
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
540
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
470
541
|
} else {
|
|
471
|
-
console.warn(`${
|
|
542
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
472
543
|
}
|
|
473
544
|
}
|
|
474
545
|
|
|
475
546
|
// src/evaluation/loaders/message-processor.ts
|
|
476
|
-
var
|
|
547
|
+
var import_promises4 = require("fs/promises");
|
|
477
548
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
478
|
-
var
|
|
479
|
-
var
|
|
549
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
550
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
480
551
|
async function processMessages(options) {
|
|
481
552
|
const {
|
|
482
553
|
messages,
|
|
@@ -519,7 +590,7 @@ async function processMessages(options) {
|
|
|
519
590
|
continue;
|
|
520
591
|
}
|
|
521
592
|
try {
|
|
522
|
-
const fileContent = (await (0,
|
|
593
|
+
const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
523
594
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
524
595
|
const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
|
|
525
596
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
@@ -590,7 +661,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
590
661
|
continue;
|
|
591
662
|
}
|
|
592
663
|
try {
|
|
593
|
-
const fileContent = (await (0,
|
|
664
|
+
const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
594
665
|
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
595
666
|
if (verbose) {
|
|
596
667
|
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
@@ -640,19 +711,19 @@ function cloneJsonValue(value) {
|
|
|
640
711
|
function logWarning3(message, details) {
|
|
641
712
|
if (details && details.length > 0) {
|
|
642
713
|
const detailBlock = details.join("\n");
|
|
643
|
-
console.warn(`${
|
|
644
|
-
${detailBlock}${
|
|
714
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}
|
|
715
|
+
${detailBlock}${ANSI_RESET4}`);
|
|
645
716
|
} else {
|
|
646
|
-
console.warn(`${
|
|
717
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
647
718
|
}
|
|
648
719
|
}
|
|
649
720
|
|
|
650
721
|
// src/evaluation/formatting/prompt-builder.ts
|
|
651
|
-
var
|
|
722
|
+
var import_promises5 = require("fs/promises");
|
|
652
723
|
var import_node_path5 = __toESM(require("path"), 1);
|
|
653
|
-
var
|
|
654
|
-
var
|
|
655
|
-
async function buildPromptInputs(testCase) {
|
|
724
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
725
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
726
|
+
async function buildPromptInputs(testCase, mode = "lm") {
|
|
656
727
|
const guidelineParts = [];
|
|
657
728
|
for (const rawPath of testCase.guideline_paths) {
|
|
658
729
|
const absolutePath = import_node_path5.default.resolve(rawPath);
|
|
@@ -661,7 +732,7 @@ async function buildPromptInputs(testCase) {
|
|
|
661
732
|
continue;
|
|
662
733
|
}
|
|
663
734
|
try {
|
|
664
|
-
const content = (await (0,
|
|
735
|
+
const content = (await (0, import_promises5.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
665
736
|
guidelineParts.push({
|
|
666
737
|
content,
|
|
667
738
|
isFile: true,
|
|
@@ -728,7 +799,7 @@ async function buildPromptInputs(testCase) {
|
|
|
728
799
|
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
729
800
|
const contentParts = [];
|
|
730
801
|
for (const segment of segments) {
|
|
731
|
-
const formattedContent = formatSegment(segment);
|
|
802
|
+
const formattedContent = formatSegment(segment, mode);
|
|
732
803
|
if (formattedContent) {
|
|
733
804
|
contentParts.push(formattedContent);
|
|
734
805
|
}
|
|
@@ -743,7 +814,11 @@ ${messageContent}`);
|
|
|
743
814
|
} else {
|
|
744
815
|
const questionParts = [];
|
|
745
816
|
for (const segment of testCase.input_segments) {
|
|
746
|
-
|
|
817
|
+
if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
|
|
818
|
+
questionParts.push(`<Attached: ${segment.path}>`);
|
|
819
|
+
continue;
|
|
820
|
+
}
|
|
821
|
+
const formattedContent = formatSegment(segment, mode);
|
|
747
822
|
if (formattedContent) {
|
|
748
823
|
questionParts.push(formattedContent);
|
|
749
824
|
}
|
|
@@ -757,7 +832,8 @@ ${messageContent}`);
|
|
|
757
832
|
messages: testCase.input_messages,
|
|
758
833
|
segmentsByMessage,
|
|
759
834
|
guidelinePatterns: testCase.guideline_patterns,
|
|
760
|
-
guidelineContent: guidelines
|
|
835
|
+
guidelineContent: guidelines,
|
|
836
|
+
mode
|
|
761
837
|
}) : void 0;
|
|
762
838
|
return { question, guidelines, chatPrompt };
|
|
763
839
|
}
|
|
@@ -774,7 +850,14 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
|
774
850
|
return messagesWithContent > 1;
|
|
775
851
|
}
|
|
776
852
|
function buildChatPromptFromSegments(options) {
|
|
777
|
-
const {
|
|
853
|
+
const {
|
|
854
|
+
messages,
|
|
855
|
+
segmentsByMessage,
|
|
856
|
+
guidelinePatterns,
|
|
857
|
+
guidelineContent,
|
|
858
|
+
systemPrompt,
|
|
859
|
+
mode = "lm"
|
|
860
|
+
} = options;
|
|
778
861
|
if (messages.length === 0) {
|
|
779
862
|
return void 0;
|
|
780
863
|
}
|
|
@@ -792,7 +875,7 @@ ${guidelineContent.trim()}`);
|
|
|
792
875
|
const segments = segmentsByMessage[startIndex];
|
|
793
876
|
const contentParts = [];
|
|
794
877
|
for (const segment of segments) {
|
|
795
|
-
const formatted = formatSegment(segment);
|
|
878
|
+
const formatted = formatSegment(segment, mode);
|
|
796
879
|
if (formatted) {
|
|
797
880
|
contentParts.push(formatted);
|
|
798
881
|
}
|
|
@@ -825,7 +908,7 @@ ${guidelineContent.trim()}`);
|
|
|
825
908
|
if (segment.type === "guideline_ref") {
|
|
826
909
|
continue;
|
|
827
910
|
}
|
|
828
|
-
const formatted = formatSegment(segment);
|
|
911
|
+
const formatted = formatSegment(segment, mode);
|
|
829
912
|
if (formatted) {
|
|
830
913
|
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
831
914
|
if (isGuidelineRef) {
|
|
@@ -849,17 +932,17 @@ function asString4(value) {
|
|
|
849
932
|
return typeof value === "string" ? value : void 0;
|
|
850
933
|
}
|
|
851
934
|
function logWarning4(message) {
|
|
852
|
-
console.warn(`${
|
|
935
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
853
936
|
}
|
|
854
937
|
|
|
855
938
|
// src/evaluation/yaml-parser.ts
|
|
856
|
-
var
|
|
857
|
-
var
|
|
858
|
-
var
|
|
939
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
940
|
+
var ANSI_RED = "\x1B[31m";
|
|
941
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
859
942
|
async function readTestSuiteMetadata(testFilePath) {
|
|
860
943
|
try {
|
|
861
944
|
const absolutePath = import_node_path6.default.resolve(testFilePath);
|
|
862
|
-
const content = await (0,
|
|
945
|
+
const content = await (0, import_promises6.readFile)(absolutePath, "utf8");
|
|
863
946
|
const parsed = (0, import_yaml2.parse)(content);
|
|
864
947
|
if (!isJsonObject(parsed)) {
|
|
865
948
|
return {};
|
|
@@ -877,7 +960,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
877
960
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
878
961
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
879
962
|
const guidelinePatterns = config?.guideline_patterns;
|
|
880
|
-
const rawFile = await (0,
|
|
963
|
+
const rawFile = await (0, import_promises6.readFile)(absoluteTestPath, "utf8");
|
|
881
964
|
const parsed = (0, import_yaml2.parse)(rawFile);
|
|
882
965
|
if (!isJsonObject(parsed)) {
|
|
883
966
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
@@ -886,12 +969,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
886
969
|
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
887
970
|
const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
888
971
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
889
|
-
const schema = suite.$schema;
|
|
890
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
891
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
892
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
893
|
-
throw new Error(message);
|
|
894
|
-
}
|
|
895
972
|
const rawTestcases = suite.evalcases;
|
|
896
973
|
if (!Array.isArray(rawTestcases)) {
|
|
897
974
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
@@ -915,14 +992,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
915
992
|
const inputMessagesValue = evalcase.input_messages;
|
|
916
993
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
917
994
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
918
|
-
|
|
995
|
+
logError(
|
|
996
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
997
|
+
);
|
|
919
998
|
continue;
|
|
920
999
|
}
|
|
921
1000
|
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
922
|
-
const inputMessages = inputMessagesValue.filter(
|
|
1001
|
+
const inputMessages = inputMessagesValue.filter(
|
|
1002
|
+
(msg) => isTestMessage(msg)
|
|
1003
|
+
);
|
|
923
1004
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
924
1005
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
925
|
-
|
|
1006
|
+
logError(`No valid expected message found for eval case: ${id}`);
|
|
926
1007
|
continue;
|
|
927
1008
|
}
|
|
928
1009
|
if (expectedMessages.length > 1) {
|
|
@@ -953,7 +1034,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
953
1034
|
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
954
1035
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
955
1036
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
956
|
-
|
|
1037
|
+
let evaluators;
|
|
1038
|
+
try {
|
|
1039
|
+
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
1040
|
+
} catch (error) {
|
|
1041
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1042
|
+
logError(`Skipping eval case '${id}': ${message}`);
|
|
1043
|
+
continue;
|
|
1044
|
+
}
|
|
957
1045
|
const userFilePaths = [];
|
|
958
1046
|
for (const segment of inputSegments) {
|
|
959
1047
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -971,7 +1059,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
971
1059
|
question,
|
|
972
1060
|
input_messages: inputMessages,
|
|
973
1061
|
input_segments: inputSegments,
|
|
974
|
-
|
|
1062
|
+
expected_segments: outputSegments,
|
|
975
1063
|
reference_answer: referenceAnswer,
|
|
976
1064
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
977
1065
|
guideline_patterns: guidelinePatterns,
|
|
@@ -1003,20 +1091,29 @@ function asString5(value) {
|
|
|
1003
1091
|
function logWarning5(message, details) {
|
|
1004
1092
|
if (details && details.length > 0) {
|
|
1005
1093
|
const detailBlock = details.join("\n");
|
|
1006
|
-
console.warn(`${
|
|
1007
|
-
${detailBlock}${
|
|
1094
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}
|
|
1095
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
1096
|
+
} else {
|
|
1097
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1098
|
+
}
|
|
1099
|
+
}
|
|
1100
|
+
function logError(message, details) {
|
|
1101
|
+
if (details && details.length > 0) {
|
|
1102
|
+
const detailBlock = details.join("\n");
|
|
1103
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1104
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
1008
1105
|
} else {
|
|
1009
|
-
console.
|
|
1106
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
|
|
1010
1107
|
}
|
|
1011
1108
|
}
|
|
1012
1109
|
|
|
1013
1110
|
// src/evaluation/file-utils.ts
|
|
1014
1111
|
var import_node_fs2 = require("fs");
|
|
1015
|
-
var
|
|
1112
|
+
var import_promises7 = require("fs/promises");
|
|
1016
1113
|
var import_node_path7 = __toESM(require("path"), 1);
|
|
1017
1114
|
async function fileExists2(filePath) {
|
|
1018
1115
|
try {
|
|
1019
|
-
await (0,
|
|
1116
|
+
await (0, import_promises7.access)(filePath, import_node_fs2.constants.F_OK);
|
|
1020
1117
|
return true;
|
|
1021
1118
|
} catch {
|
|
1022
1119
|
return false;
|
|
@@ -1026,7 +1123,7 @@ function normalizeLineEndings(content) {
|
|
|
1026
1123
|
return content.replace(/\r\n/g, "\n");
|
|
1027
1124
|
}
|
|
1028
1125
|
async function readTextFile(filePath) {
|
|
1029
|
-
const content = await (0,
|
|
1126
|
+
const content = await (0, import_promises7.readFile)(filePath, "utf8");
|
|
1030
1127
|
return normalizeLineEndings(content);
|
|
1031
1128
|
}
|
|
1032
1129
|
async function findGitRoot(startPath) {
|
|
@@ -1447,7 +1544,7 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
1447
1544
|
|
|
1448
1545
|
// src/evaluation/providers/cli.ts
|
|
1449
1546
|
var import_node_child_process = require("child_process");
|
|
1450
|
-
var
|
|
1547
|
+
var import_promises8 = __toESM(require("fs/promises"), 1);
|
|
1451
1548
|
var import_node_os = __toESM(require("os"), 1);
|
|
1452
1549
|
var import_node_path8 = __toESM(require("path"), 1);
|
|
1453
1550
|
var import_node_util = require("util");
|
|
@@ -1548,7 +1645,7 @@ var CliProvider = class {
|
|
|
1548
1645
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1549
1646
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1550
1647
|
} finally {
|
|
1551
|
-
await
|
|
1648
|
+
await import_promises8.default.unlink(filePath).catch(() => {
|
|
1552
1649
|
});
|
|
1553
1650
|
}
|
|
1554
1651
|
}
|
|
@@ -1687,7 +1784,7 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1687
1784
|
var import_node_child_process2 = require("child_process");
|
|
1688
1785
|
var import_node_crypto = require("crypto");
|
|
1689
1786
|
var import_node_fs3 = require("fs");
|
|
1690
|
-
var
|
|
1787
|
+
var import_promises9 = require("fs/promises");
|
|
1691
1788
|
var import_node_os2 = require("os");
|
|
1692
1789
|
var import_node_path10 = __toESM(require("path"), 1);
|
|
1693
1790
|
var import_node_util2 = require("util");
|
|
@@ -1755,9 +1852,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
1755
1852
|
options?.guidelineOverrides
|
|
1756
1853
|
);
|
|
1757
1854
|
const inputFilesList = collectInputFiles(inputFiles);
|
|
1758
|
-
const nonGuidelineInputFiles = inputFilesList.filter(
|
|
1759
|
-
(file) => !guidelineFiles.includes(file)
|
|
1760
|
-
);
|
|
1855
|
+
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
1761
1856
|
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
1762
1857
|
if (prereadBlock.length > 0) {
|
|
1763
1858
|
parts.push("\n", prereadBlock);
|
|
@@ -1877,7 +1972,7 @@ var CodexProvider = class {
|
|
|
1877
1972
|
try {
|
|
1878
1973
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1879
1974
|
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
1880
|
-
await (0,
|
|
1975
|
+
await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
|
|
1881
1976
|
const args = this.buildCodexArgs();
|
|
1882
1977
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
1883
1978
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -1929,7 +2024,15 @@ var CodexProvider = class {
|
|
|
1929
2024
|
return import_node_path10.default.resolve(this.config.cwd);
|
|
1930
2025
|
}
|
|
1931
2026
|
buildCodexArgs() {
|
|
1932
|
-
const args = [
|
|
2027
|
+
const args = [
|
|
2028
|
+
"--ask-for-approval",
|
|
2029
|
+
"never",
|
|
2030
|
+
"exec",
|
|
2031
|
+
"--json",
|
|
2032
|
+
"--color",
|
|
2033
|
+
"never",
|
|
2034
|
+
"--skip-git-repo-check"
|
|
2035
|
+
];
|
|
1933
2036
|
if (this.config.args && this.config.args.length > 0) {
|
|
1934
2037
|
args.push(...this.config.args);
|
|
1935
2038
|
}
|
|
@@ -1960,11 +2063,11 @@ var CodexProvider = class {
|
|
|
1960
2063
|
}
|
|
1961
2064
|
}
|
|
1962
2065
|
async createWorkspace() {
|
|
1963
|
-
return await (0,
|
|
2066
|
+
return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
|
|
1964
2067
|
}
|
|
1965
2068
|
async cleanupWorkspace(workspaceRoot) {
|
|
1966
2069
|
try {
|
|
1967
|
-
await (0,
|
|
2070
|
+
await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
|
|
1968
2071
|
} catch {
|
|
1969
2072
|
}
|
|
1970
2073
|
}
|
|
@@ -1984,7 +2087,7 @@ var CodexProvider = class {
|
|
|
1984
2087
|
return void 0;
|
|
1985
2088
|
}
|
|
1986
2089
|
try {
|
|
1987
|
-
await (0,
|
|
2090
|
+
await (0, import_promises9.mkdir)(logDir, { recursive: true });
|
|
1988
2091
|
} catch (error) {
|
|
1989
2092
|
const message = error instanceof Error ? error.message : String(error);
|
|
1990
2093
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
@@ -2207,7 +2310,7 @@ async function locateExecutable(candidate) {
|
|
|
2207
2310
|
if (includesPathSeparator) {
|
|
2208
2311
|
const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
|
|
2209
2312
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2210
|
-
await (0,
|
|
2313
|
+
await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2211
2314
|
return executablePath;
|
|
2212
2315
|
}
|
|
2213
2316
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -2217,7 +2320,7 @@ async function locateExecutable(candidate) {
|
|
|
2217
2320
|
const preferred = selectExecutableCandidate(lines);
|
|
2218
2321
|
if (preferred) {
|
|
2219
2322
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
2220
|
-
await (0,
|
|
2323
|
+
await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2221
2324
|
return executablePath;
|
|
2222
2325
|
}
|
|
2223
2326
|
} catch {
|
|
@@ -2251,7 +2354,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
2251
2354
|
for (const ext of extensions) {
|
|
2252
2355
|
const withExtension = `${candidate}${ext}`;
|
|
2253
2356
|
try {
|
|
2254
|
-
await (0,
|
|
2357
|
+
await (0, import_promises9.access)(withExtension, import_node_fs3.constants.F_OK);
|
|
2255
2358
|
return withExtension;
|
|
2256
2359
|
} catch {
|
|
2257
2360
|
}
|
|
@@ -2553,7 +2656,14 @@ var MockProvider = class {
|
|
|
2553
2656
|
|
|
2554
2657
|
// src/evaluation/providers/targets.ts
|
|
2555
2658
|
var import_zod = require("zod");
|
|
2556
|
-
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
2659
|
+
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
2660
|
+
"PROMPT",
|
|
2661
|
+
"GUIDELINES",
|
|
2662
|
+
"EVAL_ID",
|
|
2663
|
+
"ATTEMPT",
|
|
2664
|
+
"FILES",
|
|
2665
|
+
"OUTPUT_FILE"
|
|
2666
|
+
]);
|
|
2557
2667
|
var BASE_TARGET_SCHEMA = import_zod.z.object({
|
|
2558
2668
|
name: import_zod.z.string().min(1, "target name is required"),
|
|
2559
2669
|
provider: import_zod.z.string().min(1, "provider is required"),
|
|
@@ -2798,11 +2908,18 @@ function resolveMockConfig(target) {
|
|
|
2798
2908
|
return { response };
|
|
2799
2909
|
}
|
|
2800
2910
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
2801
|
-
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
2802
|
-
|
|
2803
|
-
|
|
2804
|
-
|
|
2805
|
-
|
|
2911
|
+
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
2912
|
+
target.workspace_template ?? target.workspaceTemplate
|
|
2913
|
+
);
|
|
2914
|
+
const workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
|
|
2915
|
+
workspaceTemplateEnvVar,
|
|
2916
|
+
env,
|
|
2917
|
+
`${target.name} workspace template path`,
|
|
2918
|
+
{
|
|
2919
|
+
allowLiteral: false,
|
|
2920
|
+
optionalEnv: true
|
|
2921
|
+
}
|
|
2922
|
+
) : void 0;
|
|
2806
2923
|
const commandSource = target.vscode_cmd ?? target.command;
|
|
2807
2924
|
const waitSource = target.wait;
|
|
2808
2925
|
const dryRunSource = target.dry_run ?? target.dryRun;
|
|
@@ -2829,7 +2946,10 @@ function resolveCliConfig(target, env) {
|
|
|
2829
2946
|
allowLiteral: true,
|
|
2830
2947
|
optionalEnv: true
|
|
2831
2948
|
});
|
|
2832
|
-
const timeoutMs = resolveTimeoutMs(
|
|
2949
|
+
const timeoutMs = resolveTimeoutMs(
|
|
2950
|
+
target.timeout_seconds ?? target.timeoutSeconds,
|
|
2951
|
+
`${target.name} timeout`
|
|
2952
|
+
);
|
|
2833
2953
|
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
|
|
2834
2954
|
const commandTemplate = resolveString(
|
|
2835
2955
|
commandTemplateSource,
|
|
@@ -2957,7 +3077,9 @@ function resolveOptionalString(source, env, description, options) {
|
|
|
2957
3077
|
}
|
|
2958
3078
|
const allowLiteral = options?.allowLiteral ?? false;
|
|
2959
3079
|
if (!allowLiteral) {
|
|
2960
|
-
throw new Error(
|
|
3080
|
+
throw new Error(
|
|
3081
|
+
`${description} must use \${{ VARIABLE_NAME }} syntax for environment variables or be marked as allowing literals`
|
|
3082
|
+
);
|
|
2961
3083
|
}
|
|
2962
3084
|
return trimmed;
|
|
2963
3085
|
}
|
|
@@ -3181,9 +3303,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
3181
3303
|
}
|
|
3182
3304
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
3183
3305
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
3184
|
-
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
3185
|
-
(file) => !guidelineFiles.includes(file)
|
|
3186
|
-
);
|
|
3306
|
+
const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
|
|
3187
3307
|
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
|
|
3188
3308
|
if (prereadBlock.length > 0) {
|
|
3189
3309
|
parts.push("\n", prereadBlock);
|
|
@@ -3292,8 +3412,10 @@ async function ensureVSCodeSubagents(options) {
|
|
|
3292
3412
|
if (result.skippedExisting.length > 0) {
|
|
3293
3413
|
console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
|
|
3294
3414
|
}
|
|
3295
|
-
console.log(
|
|
3296
|
-
|
|
3415
|
+
console.log(
|
|
3416
|
+
`
|
|
3417
|
+
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`
|
|
3418
|
+
);
|
|
3297
3419
|
}
|
|
3298
3420
|
return {
|
|
3299
3421
|
provisioned: true,
|
|
@@ -3313,46 +3435,12 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3313
3435
|
|
|
3314
3436
|
// src/evaluation/providers/targets-file.ts
|
|
3315
3437
|
var import_node_fs4 = require("fs");
|
|
3316
|
-
var
|
|
3438
|
+
var import_promises10 = require("fs/promises");
|
|
3317
3439
|
var import_node_path12 = __toESM(require("path"), 1);
|
|
3318
3440
|
var import_yaml3 = require("yaml");
|
|
3319
|
-
|
|
3320
|
-
// src/evaluation/providers/types.ts
|
|
3321
|
-
var AGENT_PROVIDER_KINDS = [
|
|
3322
|
-
"codex",
|
|
3323
|
-
"vscode",
|
|
3324
|
-
"vscode-insiders"
|
|
3325
|
-
];
|
|
3326
|
-
var TARGETS_SCHEMA_V2 = "agentv-targets-v2.2";
|
|
3327
|
-
function isAgentProvider(provider) {
|
|
3328
|
-
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
3329
|
-
}
|
|
3330
|
-
|
|
3331
|
-
// src/evaluation/providers/targets-file.ts
|
|
3332
3441
|
function isRecord(value) {
|
|
3333
3442
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
3334
3443
|
}
|
|
3335
|
-
function checkSchema(parsed, absolutePath) {
|
|
3336
|
-
const schema = parsed.$schema;
|
|
3337
|
-
if (schema === void 0) {
|
|
3338
|
-
throw new Error(
|
|
3339
|
-
`Missing $schema field in targets.yaml at ${absolutePath}.
|
|
3340
|
-
Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
|
|
3341
|
-
);
|
|
3342
|
-
}
|
|
3343
|
-
if (typeof schema !== "string") {
|
|
3344
|
-
throw new Error(
|
|
3345
|
-
`Invalid $schema field in targets.yaml at ${absolutePath}.
|
|
3346
|
-
Expected a string value '${TARGETS_SCHEMA_V2}'.`
|
|
3347
|
-
);
|
|
3348
|
-
}
|
|
3349
|
-
if (schema !== TARGETS_SCHEMA_V2) {
|
|
3350
|
-
throw new Error(
|
|
3351
|
-
`Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
|
|
3352
|
-
Expected '${TARGETS_SCHEMA_V2}'.`
|
|
3353
|
-
);
|
|
3354
|
-
}
|
|
3355
|
-
}
|
|
3356
3444
|
function extractTargetsArray(parsed, absolutePath) {
|
|
3357
3445
|
const targets = parsed.targets;
|
|
3358
3446
|
if (!Array.isArray(targets)) {
|
|
@@ -3367,7 +3455,9 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
3367
3455
|
const name = value.name;
|
|
3368
3456
|
const provider = value.provider;
|
|
3369
3457
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
3370
|
-
throw new Error(
|
|
3458
|
+
throw new Error(
|
|
3459
|
+
`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`
|
|
3460
|
+
);
|
|
3371
3461
|
}
|
|
3372
3462
|
if (typeof provider !== "string" || provider.trim().length === 0) {
|
|
3373
3463
|
throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
|
|
@@ -3376,7 +3466,7 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
3376
3466
|
}
|
|
3377
3467
|
async function fileExists3(filePath) {
|
|
3378
3468
|
try {
|
|
3379
|
-
await (0,
|
|
3469
|
+
await (0, import_promises10.access)(filePath, import_node_fs4.constants.F_OK);
|
|
3380
3470
|
return true;
|
|
3381
3471
|
} catch {
|
|
3382
3472
|
return false;
|
|
@@ -3387,14 +3477,15 @@ async function readTargetDefinitions(filePath) {
|
|
|
3387
3477
|
if (!await fileExists3(absolutePath)) {
|
|
3388
3478
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3389
3479
|
}
|
|
3390
|
-
const raw = await (0,
|
|
3480
|
+
const raw = await (0, import_promises10.readFile)(absolutePath, "utf8");
|
|
3391
3481
|
const parsed = (0, import_yaml3.parse)(raw);
|
|
3392
3482
|
if (!isRecord(parsed)) {
|
|
3393
|
-
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with
|
|
3483
|
+
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
3394
3484
|
}
|
|
3395
|
-
checkSchema(parsed, absolutePath);
|
|
3396
3485
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
3397
|
-
const definitions = targets.map(
|
|
3486
|
+
const definitions = targets.map(
|
|
3487
|
+
(entry, index) => assertTargetDefinition(entry, index, absolutePath)
|
|
3488
|
+
);
|
|
3398
3489
|
return definitions;
|
|
3399
3490
|
}
|
|
3400
3491
|
function listTargetNames(definitions) {
|
|
@@ -3438,16 +3529,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
|
|
|
3438
3529
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3439
3530
|
|
|
3440
3531
|
[[ ## expected_outcome ## ]]
|
|
3441
|
-
{{
|
|
3532
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
|
|
3442
3533
|
|
|
3443
3534
|
[[ ## question ## ]]
|
|
3444
|
-
{{
|
|
3535
|
+
{{${TEMPLATE_VARIABLES.QUESTION}}}
|
|
3445
3536
|
|
|
3446
3537
|
[[ ## reference_answer ## ]]
|
|
3447
|
-
{{
|
|
3538
|
+
{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
|
|
3448
3539
|
|
|
3449
3540
|
[[ ## candidate_answer ## ]]
|
|
3450
|
-
{{
|
|
3541
|
+
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
3451
3542
|
var LlmJudgeEvaluator = class {
|
|
3452
3543
|
kind = "llm_judge";
|
|
3453
3544
|
resolveJudgeProvider;
|
|
@@ -3470,12 +3561,16 @@ var LlmJudgeEvaluator = class {
|
|
|
3470
3561
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
3471
3562
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3472
3563
|
const variables = {
|
|
3473
|
-
|
|
3474
|
-
|
|
3475
|
-
|
|
3476
|
-
|
|
3477
|
-
|
|
3478
|
-
|
|
3564
|
+
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3565
|
+
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
3566
|
+
context.evalCase.expected_segments,
|
|
3567
|
+
null,
|
|
3568
|
+
2
|
|
3569
|
+
),
|
|
3570
|
+
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
3571
|
+
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
3572
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
3573
|
+
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
|
|
3479
3574
|
};
|
|
3480
3575
|
const systemPrompt = buildOutputSchema();
|
|
3481
3576
|
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
@@ -3707,17 +3802,17 @@ function parseJsonSafe(payload) {
|
|
|
3707
3802
|
}
|
|
3708
3803
|
}
|
|
3709
3804
|
function substituteVariables(template, variables) {
|
|
3710
|
-
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
3805
|
+
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
3711
3806
|
return variables[varName] ?? match;
|
|
3712
3807
|
});
|
|
3713
3808
|
}
|
|
3714
3809
|
|
|
3715
3810
|
// src/evaluation/orchestrator.ts
|
|
3716
3811
|
var import_node_crypto2 = require("crypto");
|
|
3717
|
-
var
|
|
3812
|
+
var import_promises11 = require("fs/promises");
|
|
3718
3813
|
var import_node_path13 = __toESM(require("path"), 1);
|
|
3719
3814
|
|
|
3720
|
-
// ../../node_modules/.
|
|
3815
|
+
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
3721
3816
|
var Node = class {
|
|
3722
3817
|
value;
|
|
3723
3818
|
next;
|
|
@@ -3750,6 +3845,9 @@ var Queue = class {
|
|
|
3750
3845
|
}
|
|
3751
3846
|
this.#head = this.#head.next;
|
|
3752
3847
|
this.#size--;
|
|
3848
|
+
if (!this.#head) {
|
|
3849
|
+
this.#tail = void 0;
|
|
3850
|
+
}
|
|
3753
3851
|
return current.value;
|
|
3754
3852
|
}
|
|
3755
3853
|
peek() {
|
|
@@ -3780,7 +3878,7 @@ var Queue = class {
|
|
|
3780
3878
|
}
|
|
3781
3879
|
};
|
|
3782
3880
|
|
|
3783
|
-
// ../../node_modules/.
|
|
3881
|
+
// ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
|
|
3784
3882
|
function pLimit(concurrency) {
|
|
3785
3883
|
validateConcurrency(concurrency);
|
|
3786
3884
|
const queue = new Queue();
|
|
@@ -3853,6 +3951,16 @@ function validateConcurrency(concurrency) {
|
|
|
3853
3951
|
}
|
|
3854
3952
|
}
|
|
3855
3953
|
|
|
3954
|
+
// src/evaluation/providers/types.ts
|
|
3955
|
+
var AGENT_PROVIDER_KINDS = [
|
|
3956
|
+
"codex",
|
|
3957
|
+
"vscode",
|
|
3958
|
+
"vscode-insiders"
|
|
3959
|
+
];
|
|
3960
|
+
function isAgentProvider(provider) {
|
|
3961
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
3962
|
+
}
|
|
3963
|
+
|
|
3856
3964
|
// src/evaluation/orchestrator.ts
|
|
3857
3965
|
async function runEvaluation(options) {
|
|
3858
3966
|
const {
|
|
@@ -3871,11 +3979,11 @@ async function runEvaluation(options) {
|
|
|
3871
3979
|
now,
|
|
3872
3980
|
evalId,
|
|
3873
3981
|
verbose,
|
|
3982
|
+
evalCases: preloadedEvalCases,
|
|
3874
3983
|
onResult,
|
|
3875
3984
|
onProgress
|
|
3876
3985
|
} = options;
|
|
3877
|
-
const
|
|
3878
|
-
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3986
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
|
|
3879
3987
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3880
3988
|
if (filteredEvalCases.length === 0) {
|
|
3881
3989
|
if (evalId) {
|
|
@@ -3956,7 +4064,9 @@ async function runEvaluation(options) {
|
|
|
3956
4064
|
} catch (error) {
|
|
3957
4065
|
if (verbose) {
|
|
3958
4066
|
const message = error instanceof Error ? error.message : String(error);
|
|
3959
|
-
console.warn(
|
|
4067
|
+
console.warn(
|
|
4068
|
+
`Provider batch execution failed, falling back to per-case dispatch: ${message}`
|
|
4069
|
+
);
|
|
3960
4070
|
}
|
|
3961
4071
|
}
|
|
3962
4072
|
}
|
|
@@ -4059,8 +4169,9 @@ async function runBatchEvaluation(options) {
|
|
|
4059
4169
|
agentTimeoutMs
|
|
4060
4170
|
} = options;
|
|
4061
4171
|
const promptInputsList = [];
|
|
4172
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
4062
4173
|
for (const evalCase of evalCases) {
|
|
4063
|
-
const promptInputs = await buildPromptInputs(evalCase);
|
|
4174
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
4064
4175
|
if (promptDumpDir) {
|
|
4065
4176
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
4066
4177
|
}
|
|
@@ -4119,7 +4230,14 @@ async function runBatchEvaluation(options) {
|
|
|
4119
4230
|
agentTimeoutMs
|
|
4120
4231
|
});
|
|
4121
4232
|
} catch (error) {
|
|
4122
|
-
const errorResult = buildErrorResult(
|
|
4233
|
+
const errorResult = buildErrorResult(
|
|
4234
|
+
evalCase,
|
|
4235
|
+
target.name,
|
|
4236
|
+
nowFn(),
|
|
4237
|
+
error,
|
|
4238
|
+
promptInputs,
|
|
4239
|
+
provider
|
|
4240
|
+
);
|
|
4123
4241
|
results.push(errorResult);
|
|
4124
4242
|
if (onResult) {
|
|
4125
4243
|
await onResult(errorResult);
|
|
@@ -4166,7 +4284,8 @@ async function runEvalCase(options) {
|
|
|
4166
4284
|
signal,
|
|
4167
4285
|
judgeProvider
|
|
4168
4286
|
} = options;
|
|
4169
|
-
const
|
|
4287
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
4288
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
4170
4289
|
if (promptDumpDir) {
|
|
4171
4290
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
4172
4291
|
}
|
|
@@ -4296,7 +4415,18 @@ async function evaluateCandidate(options) {
|
|
|
4296
4415
|
};
|
|
4297
4416
|
}
|
|
4298
4417
|
async function runEvaluatorsForCase(options) {
|
|
4299
|
-
const {
|
|
4418
|
+
const {
|
|
4419
|
+
evalCase,
|
|
4420
|
+
candidate,
|
|
4421
|
+
target,
|
|
4422
|
+
provider,
|
|
4423
|
+
evaluators,
|
|
4424
|
+
attempt,
|
|
4425
|
+
promptInputs,
|
|
4426
|
+
now,
|
|
4427
|
+
judgeProvider,
|
|
4428
|
+
agentTimeoutMs
|
|
4429
|
+
} = options;
|
|
4300
4430
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
4301
4431
|
return runEvaluatorList({
|
|
4302
4432
|
evalCase,
|
|
@@ -4397,7 +4527,6 @@ async function runEvaluatorList(options) {
|
|
|
4397
4527
|
reasoning: score2.reasoning,
|
|
4398
4528
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4399
4529
|
});
|
|
4400
|
-
continue;
|
|
4401
4530
|
}
|
|
4402
4531
|
} catch (error) {
|
|
4403
4532
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -4408,7 +4537,11 @@ async function runEvaluatorList(options) {
|
|
|
4408
4537
|
expectedAspectCount: 1,
|
|
4409
4538
|
reasoning: message
|
|
4410
4539
|
};
|
|
4411
|
-
scored.push({
|
|
4540
|
+
scored.push({
|
|
4541
|
+
score: fallbackScore,
|
|
4542
|
+
name: evaluator.name ?? "unknown",
|
|
4543
|
+
type: evaluator.type ?? "unknown"
|
|
4544
|
+
});
|
|
4412
4545
|
evaluatorResults.push({
|
|
4413
4546
|
name: evaluator.name ?? "unknown",
|
|
4414
4547
|
type: evaluator.type ?? "unknown",
|
|
@@ -4422,7 +4555,10 @@ async function runEvaluatorList(options) {
|
|
|
4422
4555
|
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
4423
4556
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
4424
4557
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
4425
|
-
const expectedAspectCount = scored.reduce(
|
|
4558
|
+
const expectedAspectCount = scored.reduce(
|
|
4559
|
+
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
4560
|
+
0
|
|
4561
|
+
);
|
|
4426
4562
|
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
4427
4563
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
4428
4564
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
@@ -4437,7 +4573,18 @@ async function runEvaluatorList(options) {
|
|
|
4437
4573
|
return { score, evaluatorResults };
|
|
4438
4574
|
}
|
|
4439
4575
|
async function runLlmJudgeEvaluator(options) {
|
|
4440
|
-
const {
|
|
4576
|
+
const {
|
|
4577
|
+
config,
|
|
4578
|
+
evalCase,
|
|
4579
|
+
candidate,
|
|
4580
|
+
target,
|
|
4581
|
+
provider,
|
|
4582
|
+
evaluatorRegistry,
|
|
4583
|
+
attempt,
|
|
4584
|
+
promptInputs,
|
|
4585
|
+
now,
|
|
4586
|
+
judgeProvider
|
|
4587
|
+
} = options;
|
|
4441
4588
|
const customPrompt = await resolveCustomPrompt(config);
|
|
4442
4589
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
4443
4590
|
evalCase,
|
|
@@ -4455,7 +4602,8 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
4455
4602
|
async function resolveCustomPrompt(config) {
|
|
4456
4603
|
if (config.promptPath) {
|
|
4457
4604
|
try {
|
|
4458
|
-
|
|
4605
|
+
const content = await readTextFile(config.promptPath);
|
|
4606
|
+
return content;
|
|
4459
4607
|
} catch (error) {
|
|
4460
4608
|
const message = error instanceof Error ? error.message : String(error);
|
|
4461
4609
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|
|
@@ -4490,14 +4638,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
|
4490
4638
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
4491
4639
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
4492
4640
|
const filePath = import_node_path13.default.resolve(directory, filename);
|
|
4493
|
-
await (0,
|
|
4641
|
+
await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
|
|
4494
4642
|
const payload = {
|
|
4495
4643
|
eval_id: evalCase.id,
|
|
4496
4644
|
question: promptInputs.question,
|
|
4497
4645
|
guidelines: promptInputs.guidelines,
|
|
4498
4646
|
guideline_paths: evalCase.guideline_paths
|
|
4499
4647
|
};
|
|
4500
|
-
await (0,
|
|
4648
|
+
await (0, import_promises11.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
4501
4649
|
}
|
|
4502
4650
|
function sanitizeFilename(value) {
|
|
4503
4651
|
if (!value) {
|