@agentv/core 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IOCVST3R.js → chunk-YCIZ33BO.js} +28 -11
- package/dist/chunk-YCIZ33BO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +68 -64
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +64 -67
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +297 -149
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -5
- package/dist/index.d.ts +18 -5
- package/dist/index.js +251 -115
- package/dist/index.js.map +1 -1
- package/package.json +15 -16
- package/LICENSE +0 -21
- package/dist/chunk-IOCVST3R.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import {
|
|
2
|
-
TARGETS_SCHEMA_V2,
|
|
3
2
|
buildDirectoryChain,
|
|
4
3
|
buildSearchRoots,
|
|
5
4
|
fileExists,
|
|
@@ -9,7 +8,7 @@ import {
|
|
|
9
8
|
readTextFile,
|
|
10
9
|
resolveFileReference,
|
|
11
10
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
11
|
+
} from "./chunk-YCIZ33BO.js";
|
|
13
12
|
|
|
14
13
|
// src/evaluation/types.ts
|
|
15
14
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -62,7 +61,7 @@ function getHitCount(result) {
|
|
|
62
61
|
}
|
|
63
62
|
|
|
64
63
|
// src/evaluation/yaml-parser.ts
|
|
65
|
-
import { readFile as
|
|
64
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
66
65
|
import path6 from "node:path";
|
|
67
66
|
import { parse as parse2 } from "yaml";
|
|
68
67
|
|
|
@@ -71,11 +70,11 @@ function extractCodeBlocks(segments) {
|
|
|
71
70
|
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
71
|
const codeBlocks = [];
|
|
73
72
|
for (const segment of segments) {
|
|
74
|
-
const typeValue = segment
|
|
73
|
+
const typeValue = segment.type;
|
|
75
74
|
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
76
75
|
continue;
|
|
77
76
|
}
|
|
78
|
-
const textValue = segment
|
|
77
|
+
const textValue = segment.value;
|
|
79
78
|
if (typeof textValue !== "string") {
|
|
80
79
|
continue;
|
|
81
80
|
}
|
|
@@ -100,7 +99,7 @@ ${part.content}
|
|
|
100
99
|
}
|
|
101
100
|
return parts.map((p) => p.content).join(" ");
|
|
102
101
|
}
|
|
103
|
-
function formatSegment(segment) {
|
|
102
|
+
function formatSegment(segment, mode = "lm") {
|
|
104
103
|
const type = asString(segment.type);
|
|
105
104
|
if (type === "text") {
|
|
106
105
|
return asString(segment.value);
|
|
@@ -110,8 +109,14 @@ function formatSegment(segment) {
|
|
|
110
109
|
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
111
110
|
}
|
|
112
111
|
if (type === "file") {
|
|
113
|
-
const text = asString(segment.text);
|
|
114
112
|
const filePath = asString(segment.path);
|
|
113
|
+
if (!filePath) {
|
|
114
|
+
return void 0;
|
|
115
|
+
}
|
|
116
|
+
if (mode === "agent") {
|
|
117
|
+
return `<file: path="${filePath}">`;
|
|
118
|
+
}
|
|
119
|
+
const text = asString(segment.text);
|
|
115
120
|
if (text && filePath) {
|
|
116
121
|
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
117
122
|
}
|
|
@@ -140,9 +145,9 @@ function asString(value) {
|
|
|
140
145
|
}
|
|
141
146
|
|
|
142
147
|
// src/evaluation/loaders/config-loader.ts
|
|
143
|
-
import micromatch from "micromatch";
|
|
144
148
|
import { readFile } from "node:fs/promises";
|
|
145
149
|
import path2 from "node:path";
|
|
150
|
+
import micromatch from "micromatch";
|
|
146
151
|
import { parse } from "yaml";
|
|
147
152
|
|
|
148
153
|
// src/evaluation/loaders/file-resolver.ts
|
|
@@ -284,8 +289,9 @@ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
|
284
289
|
guideline_patterns: guidelinePatterns
|
|
285
290
|
};
|
|
286
291
|
} catch (error) {
|
|
287
|
-
logWarning(
|
|
288
|
-
|
|
292
|
+
logWarning(
|
|
293
|
+
`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`
|
|
294
|
+
);
|
|
289
295
|
}
|
|
290
296
|
}
|
|
291
297
|
return null;
|
|
@@ -315,8 +321,66 @@ function logWarning(message) {
|
|
|
315
321
|
|
|
316
322
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
317
323
|
import path3 from "node:path";
|
|
324
|
+
|
|
325
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
326
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
327
|
+
|
|
328
|
+
// src/evaluation/template-variables.ts
|
|
329
|
+
var TEMPLATE_VARIABLES = {
|
|
330
|
+
CANDIDATE_ANSWER: "candidate_answer",
|
|
331
|
+
EXPECTED_MESSAGES: "expected_messages",
|
|
332
|
+
QUESTION: "question",
|
|
333
|
+
EXPECTED_OUTCOME: "expected_outcome",
|
|
334
|
+
REFERENCE_ANSWER: "reference_answer",
|
|
335
|
+
INPUT_MESSAGES: "input_messages"
|
|
336
|
+
};
|
|
337
|
+
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
338
|
+
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
339
|
+
TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
|
|
340
|
+
TEMPLATE_VARIABLES.EXPECTED_MESSAGES
|
|
341
|
+
]);
|
|
342
|
+
|
|
343
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
318
344
|
var ANSI_YELLOW2 = "\x1B[33m";
|
|
319
345
|
var ANSI_RESET2 = "\x1B[0m";
|
|
346
|
+
async function validateCustomPromptContent(promptPath) {
|
|
347
|
+
const content = await readFile2(promptPath, "utf8");
|
|
348
|
+
validateTemplateVariables(content, promptPath);
|
|
349
|
+
}
|
|
350
|
+
function validateTemplateVariables(content, source) {
|
|
351
|
+
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
|
|
352
|
+
const foundVariables = /* @__PURE__ */ new Set();
|
|
353
|
+
const invalidVariables = [];
|
|
354
|
+
let match = variablePattern.exec(content);
|
|
355
|
+
while (match !== null) {
|
|
356
|
+
const varName = match[1];
|
|
357
|
+
foundVariables.add(varName);
|
|
358
|
+
if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
|
|
359
|
+
invalidVariables.push(varName);
|
|
360
|
+
}
|
|
361
|
+
match = variablePattern.exec(content);
|
|
362
|
+
}
|
|
363
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
|
|
364
|
+
const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
|
|
365
|
+
const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
|
|
366
|
+
if (!hasRequiredFields) {
|
|
367
|
+
throw new Error(
|
|
368
|
+
`Missing required fields. Must include at least one of:
|
|
369
|
+
- {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
|
|
370
|
+
- {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
|
|
371
|
+
);
|
|
372
|
+
}
|
|
373
|
+
if (invalidVariables.length > 0) {
|
|
374
|
+
const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
|
|
375
|
+
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
376
|
+
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
|
|
377
|
+
console.warn(warningMessage);
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
382
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
383
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
320
384
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
321
385
|
const execution = rawEvalCase.execution;
|
|
322
386
|
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
@@ -375,6 +439,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
375
439
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
376
440
|
if (resolved.resolvedPath) {
|
|
377
441
|
promptPath = path3.resolve(resolved.resolvedPath);
|
|
442
|
+
try {
|
|
443
|
+
await validateCustomPromptContent(promptPath);
|
|
444
|
+
} catch (error) {
|
|
445
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
446
|
+
throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
|
|
447
|
+
}
|
|
378
448
|
} else {
|
|
379
449
|
logWarning2(
|
|
380
450
|
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
@@ -411,18 +481,18 @@ function isJsonObject2(value) {
|
|
|
411
481
|
function logWarning2(message, details) {
|
|
412
482
|
if (details && details.length > 0) {
|
|
413
483
|
const detailBlock = details.join("\n");
|
|
414
|
-
console.warn(`${
|
|
415
|
-
${detailBlock}${
|
|
484
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
485
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
416
486
|
} else {
|
|
417
|
-
console.warn(`${
|
|
487
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
418
488
|
}
|
|
419
489
|
}
|
|
420
490
|
|
|
421
491
|
// src/evaluation/loaders/message-processor.ts
|
|
422
|
-
import { readFile as
|
|
492
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
423
493
|
import path4 from "node:path";
|
|
424
|
-
var
|
|
425
|
-
var
|
|
494
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
495
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
426
496
|
async function processMessages(options) {
|
|
427
497
|
const {
|
|
428
498
|
messages,
|
|
@@ -465,7 +535,7 @@ async function processMessages(options) {
|
|
|
465
535
|
continue;
|
|
466
536
|
}
|
|
467
537
|
try {
|
|
468
|
-
const fileContent = (await
|
|
538
|
+
const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
469
539
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
470
540
|
const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
|
|
471
541
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
@@ -536,7 +606,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
536
606
|
continue;
|
|
537
607
|
}
|
|
538
608
|
try {
|
|
539
|
-
const fileContent = (await
|
|
609
|
+
const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
540
610
|
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
541
611
|
if (verbose) {
|
|
542
612
|
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
@@ -586,19 +656,19 @@ function cloneJsonValue(value) {
|
|
|
586
656
|
function logWarning3(message, details) {
|
|
587
657
|
if (details && details.length > 0) {
|
|
588
658
|
const detailBlock = details.join("\n");
|
|
589
|
-
console.warn(`${
|
|
590
|
-
${detailBlock}${
|
|
659
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}
|
|
660
|
+
${detailBlock}${ANSI_RESET4}`);
|
|
591
661
|
} else {
|
|
592
|
-
console.warn(`${
|
|
662
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
593
663
|
}
|
|
594
664
|
}
|
|
595
665
|
|
|
596
666
|
// src/evaluation/formatting/prompt-builder.ts
|
|
597
|
-
import { readFile as
|
|
667
|
+
import { readFile as readFile4 } from "node:fs/promises";
|
|
598
668
|
import path5 from "node:path";
|
|
599
|
-
var
|
|
600
|
-
var
|
|
601
|
-
async function buildPromptInputs(testCase) {
|
|
669
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
670
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
671
|
+
async function buildPromptInputs(testCase, mode = "lm") {
|
|
602
672
|
const guidelineParts = [];
|
|
603
673
|
for (const rawPath of testCase.guideline_paths) {
|
|
604
674
|
const absolutePath = path5.resolve(rawPath);
|
|
@@ -607,7 +677,7 @@ async function buildPromptInputs(testCase) {
|
|
|
607
677
|
continue;
|
|
608
678
|
}
|
|
609
679
|
try {
|
|
610
|
-
const content = (await
|
|
680
|
+
const content = (await readFile4(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
611
681
|
guidelineParts.push({
|
|
612
682
|
content,
|
|
613
683
|
isFile: true,
|
|
@@ -674,7 +744,7 @@ async function buildPromptInputs(testCase) {
|
|
|
674
744
|
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
675
745
|
const contentParts = [];
|
|
676
746
|
for (const segment of segments) {
|
|
677
|
-
const formattedContent = formatSegment(segment);
|
|
747
|
+
const formattedContent = formatSegment(segment, mode);
|
|
678
748
|
if (formattedContent) {
|
|
679
749
|
contentParts.push(formattedContent);
|
|
680
750
|
}
|
|
@@ -689,7 +759,11 @@ ${messageContent}`);
|
|
|
689
759
|
} else {
|
|
690
760
|
const questionParts = [];
|
|
691
761
|
for (const segment of testCase.input_segments) {
|
|
692
|
-
|
|
762
|
+
if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
|
|
763
|
+
questionParts.push(`<Attached: ${segment.path}>`);
|
|
764
|
+
continue;
|
|
765
|
+
}
|
|
766
|
+
const formattedContent = formatSegment(segment, mode);
|
|
693
767
|
if (formattedContent) {
|
|
694
768
|
questionParts.push(formattedContent);
|
|
695
769
|
}
|
|
@@ -703,7 +777,8 @@ ${messageContent}`);
|
|
|
703
777
|
messages: testCase.input_messages,
|
|
704
778
|
segmentsByMessage,
|
|
705
779
|
guidelinePatterns: testCase.guideline_patterns,
|
|
706
|
-
guidelineContent: guidelines
|
|
780
|
+
guidelineContent: guidelines,
|
|
781
|
+
mode
|
|
707
782
|
}) : void 0;
|
|
708
783
|
return { question, guidelines, chatPrompt };
|
|
709
784
|
}
|
|
@@ -720,7 +795,14 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
|
720
795
|
return messagesWithContent > 1;
|
|
721
796
|
}
|
|
722
797
|
function buildChatPromptFromSegments(options) {
|
|
723
|
-
const {
|
|
798
|
+
const {
|
|
799
|
+
messages,
|
|
800
|
+
segmentsByMessage,
|
|
801
|
+
guidelinePatterns,
|
|
802
|
+
guidelineContent,
|
|
803
|
+
systemPrompt,
|
|
804
|
+
mode = "lm"
|
|
805
|
+
} = options;
|
|
724
806
|
if (messages.length === 0) {
|
|
725
807
|
return void 0;
|
|
726
808
|
}
|
|
@@ -738,7 +820,7 @@ ${guidelineContent.trim()}`);
|
|
|
738
820
|
const segments = segmentsByMessage[startIndex];
|
|
739
821
|
const contentParts = [];
|
|
740
822
|
for (const segment of segments) {
|
|
741
|
-
const formatted = formatSegment(segment);
|
|
823
|
+
const formatted = formatSegment(segment, mode);
|
|
742
824
|
if (formatted) {
|
|
743
825
|
contentParts.push(formatted);
|
|
744
826
|
}
|
|
@@ -771,7 +853,7 @@ ${guidelineContent.trim()}`);
|
|
|
771
853
|
if (segment.type === "guideline_ref") {
|
|
772
854
|
continue;
|
|
773
855
|
}
|
|
774
|
-
const formatted = formatSegment(segment);
|
|
856
|
+
const formatted = formatSegment(segment, mode);
|
|
775
857
|
if (formatted) {
|
|
776
858
|
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
777
859
|
if (isGuidelineRef) {
|
|
@@ -795,17 +877,17 @@ function asString4(value) {
|
|
|
795
877
|
return typeof value === "string" ? value : void 0;
|
|
796
878
|
}
|
|
797
879
|
function logWarning4(message) {
|
|
798
|
-
console.warn(`${
|
|
880
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
799
881
|
}
|
|
800
882
|
|
|
801
883
|
// src/evaluation/yaml-parser.ts
|
|
802
|
-
var
|
|
803
|
-
var
|
|
804
|
-
var
|
|
884
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
885
|
+
var ANSI_RED = "\x1B[31m";
|
|
886
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
805
887
|
async function readTestSuiteMetadata(testFilePath) {
|
|
806
888
|
try {
|
|
807
889
|
const absolutePath = path6.resolve(testFilePath);
|
|
808
|
-
const content = await
|
|
890
|
+
const content = await readFile5(absolutePath, "utf8");
|
|
809
891
|
const parsed = parse2(content);
|
|
810
892
|
if (!isJsonObject(parsed)) {
|
|
811
893
|
return {};
|
|
@@ -823,7 +905,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
823
905
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
824
906
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
825
907
|
const guidelinePatterns = config?.guideline_patterns;
|
|
826
|
-
const rawFile = await
|
|
908
|
+
const rawFile = await readFile5(absoluteTestPath, "utf8");
|
|
827
909
|
const parsed = parse2(rawFile);
|
|
828
910
|
if (!isJsonObject(parsed)) {
|
|
829
911
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
@@ -832,12 +914,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
832
914
|
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
833
915
|
const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
834
916
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
835
|
-
const schema = suite.$schema;
|
|
836
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
837
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
838
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
839
|
-
throw new Error(message);
|
|
840
|
-
}
|
|
841
917
|
const rawTestcases = suite.evalcases;
|
|
842
918
|
if (!Array.isArray(rawTestcases)) {
|
|
843
919
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
@@ -861,14 +937,18 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
861
937
|
const inputMessagesValue = evalcase.input_messages;
|
|
862
938
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
863
939
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
864
|
-
|
|
940
|
+
logError(
|
|
941
|
+
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
942
|
+
);
|
|
865
943
|
continue;
|
|
866
944
|
}
|
|
867
945
|
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
868
|
-
const inputMessages = inputMessagesValue.filter(
|
|
946
|
+
const inputMessages = inputMessagesValue.filter(
|
|
947
|
+
(msg) => isTestMessage(msg)
|
|
948
|
+
);
|
|
869
949
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
870
950
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
871
|
-
|
|
951
|
+
logError(`No valid expected message found for eval case: ${id}`);
|
|
872
952
|
continue;
|
|
873
953
|
}
|
|
874
954
|
if (expectedMessages.length > 1) {
|
|
@@ -899,7 +979,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
899
979
|
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
900
980
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
901
981
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
902
|
-
|
|
982
|
+
let evaluators;
|
|
983
|
+
try {
|
|
984
|
+
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
985
|
+
} catch (error) {
|
|
986
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
987
|
+
logError(`Skipping eval case '${id}': ${message}`);
|
|
988
|
+
continue;
|
|
989
|
+
}
|
|
903
990
|
const userFilePaths = [];
|
|
904
991
|
for (const segment of inputSegments) {
|
|
905
992
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -917,7 +1004,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
917
1004
|
question,
|
|
918
1005
|
input_messages: inputMessages,
|
|
919
1006
|
input_segments: inputSegments,
|
|
920
|
-
|
|
1007
|
+
expected_segments: outputSegments,
|
|
921
1008
|
reference_answer: referenceAnswer,
|
|
922
1009
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
923
1010
|
guideline_patterns: guidelinePatterns,
|
|
@@ -949,10 +1036,19 @@ function asString5(value) {
|
|
|
949
1036
|
function logWarning5(message, details) {
|
|
950
1037
|
if (details && details.length > 0) {
|
|
951
1038
|
const detailBlock = details.join("\n");
|
|
952
|
-
console.warn(`${
|
|
953
|
-
${detailBlock}${
|
|
1039
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}
|
|
1040
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
954
1041
|
} else {
|
|
955
|
-
console.warn(`${
|
|
1042
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1043
|
+
}
|
|
1044
|
+
}
|
|
1045
|
+
function logError(message, details) {
|
|
1046
|
+
if (details && details.length > 0) {
|
|
1047
|
+
const detailBlock = details.join("\n");
|
|
1048
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1049
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
1050
|
+
} else {
|
|
1051
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
|
|
956
1052
|
}
|
|
957
1053
|
}
|
|
958
1054
|
|
|
@@ -1522,7 +1618,7 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1522
1618
|
import { exec as execCallback, spawn } from "node:child_process";
|
|
1523
1619
|
import { randomUUID } from "node:crypto";
|
|
1524
1620
|
import { constants as constants2, createWriteStream } from "node:fs";
|
|
1525
|
-
import { access as access2,
|
|
1621
|
+
import { access as access2, mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
1526
1622
|
import { tmpdir } from "node:os";
|
|
1527
1623
|
import path9 from "node:path";
|
|
1528
1624
|
import { promisify as promisify2 } from "node:util";
|
|
@@ -1590,9 +1686,7 @@ function buildPromptDocument(request, inputFiles, options) {
|
|
|
1590
1686
|
options?.guidelineOverrides
|
|
1591
1687
|
);
|
|
1592
1688
|
const inputFilesList = collectInputFiles(inputFiles);
|
|
1593
|
-
const nonGuidelineInputFiles = inputFilesList.filter(
|
|
1594
|
-
(file) => !guidelineFiles.includes(file)
|
|
1595
|
-
);
|
|
1689
|
+
const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
|
|
1596
1690
|
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
|
|
1597
1691
|
if (prereadBlock.length > 0) {
|
|
1598
1692
|
parts.push("\n", prereadBlock);
|
|
@@ -1764,7 +1858,15 @@ var CodexProvider = class {
|
|
|
1764
1858
|
return path9.resolve(this.config.cwd);
|
|
1765
1859
|
}
|
|
1766
1860
|
buildCodexArgs() {
|
|
1767
|
-
const args = [
|
|
1861
|
+
const args = [
|
|
1862
|
+
"--ask-for-approval",
|
|
1863
|
+
"never",
|
|
1864
|
+
"exec",
|
|
1865
|
+
"--json",
|
|
1866
|
+
"--color",
|
|
1867
|
+
"never",
|
|
1868
|
+
"--skip-git-repo-check"
|
|
1869
|
+
];
|
|
1768
1870
|
if (this.config.args && this.config.args.length > 0) {
|
|
1769
1871
|
args.push(...this.config.args);
|
|
1770
1872
|
}
|
|
@@ -2388,7 +2490,12 @@ var MockProvider = class {
|
|
|
2388
2490
|
|
|
2389
2491
|
// src/evaluation/providers/vscode.ts
|
|
2390
2492
|
import path10 from "node:path";
|
|
2391
|
-
import {
|
|
2493
|
+
import {
|
|
2494
|
+
dispatchAgentSession,
|
|
2495
|
+
dispatchBatchAgent,
|
|
2496
|
+
getSubagentRoot,
|
|
2497
|
+
provisionSubagents
|
|
2498
|
+
} from "subagent";
|
|
2392
2499
|
var VSCodeProvider = class {
|
|
2393
2500
|
id;
|
|
2394
2501
|
kind;
|
|
@@ -2505,9 +2612,7 @@ function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
|
2505
2612
|
}
|
|
2506
2613
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2507
2614
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2508
|
-
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
2509
|
-
(file) => !guidelineFiles.includes(file)
|
|
2510
|
-
);
|
|
2615
|
+
const nonGuidelineAttachments = attachmentFiles.filter((file) => !guidelineFiles.includes(file));
|
|
2511
2616
|
const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineAttachments);
|
|
2512
2617
|
if (prereadBlock.length > 0) {
|
|
2513
2618
|
parts.push("\n", prereadBlock);
|
|
@@ -2616,8 +2721,10 @@ async function ensureVSCodeSubagents(options) {
|
|
|
2616
2721
|
if (result.skippedExisting.length > 0) {
|
|
2617
2722
|
console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
|
|
2618
2723
|
}
|
|
2619
|
-
console.log(
|
|
2620
|
-
|
|
2724
|
+
console.log(
|
|
2725
|
+
`
|
|
2726
|
+
total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`
|
|
2727
|
+
);
|
|
2621
2728
|
}
|
|
2622
2729
|
return {
|
|
2623
2730
|
provisioned: true,
|
|
@@ -2637,33 +2744,12 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2637
2744
|
|
|
2638
2745
|
// src/evaluation/providers/targets-file.ts
|
|
2639
2746
|
import { constants as constants3 } from "node:fs";
|
|
2640
|
-
import { access as access3, readFile as
|
|
2747
|
+
import { access as access3, readFile as readFile6 } from "node:fs/promises";
|
|
2641
2748
|
import path11 from "node:path";
|
|
2642
2749
|
import { parse as parse3 } from "yaml";
|
|
2643
2750
|
function isRecord(value) {
|
|
2644
2751
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
2645
2752
|
}
|
|
2646
|
-
function checkSchema(parsed, absolutePath) {
|
|
2647
|
-
const schema = parsed.$schema;
|
|
2648
|
-
if (schema === void 0) {
|
|
2649
|
-
throw new Error(
|
|
2650
|
-
`Missing $schema field in targets.yaml at ${absolutePath}.
|
|
2651
|
-
Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
|
|
2652
|
-
);
|
|
2653
|
-
}
|
|
2654
|
-
if (typeof schema !== "string") {
|
|
2655
|
-
throw new Error(
|
|
2656
|
-
`Invalid $schema field in targets.yaml at ${absolutePath}.
|
|
2657
|
-
Expected a string value '${TARGETS_SCHEMA_V2}'.`
|
|
2658
|
-
);
|
|
2659
|
-
}
|
|
2660
|
-
if (schema !== TARGETS_SCHEMA_V2) {
|
|
2661
|
-
throw new Error(
|
|
2662
|
-
`Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
|
|
2663
|
-
Expected '${TARGETS_SCHEMA_V2}'.`
|
|
2664
|
-
);
|
|
2665
|
-
}
|
|
2666
|
-
}
|
|
2667
2753
|
function extractTargetsArray(parsed, absolutePath) {
|
|
2668
2754
|
const targets = parsed.targets;
|
|
2669
2755
|
if (!Array.isArray(targets)) {
|
|
@@ -2678,7 +2764,9 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
2678
2764
|
const name = value.name;
|
|
2679
2765
|
const provider = value.provider;
|
|
2680
2766
|
if (typeof name !== "string" || name.trim().length === 0) {
|
|
2681
|
-
throw new Error(
|
|
2767
|
+
throw new Error(
|
|
2768
|
+
`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`
|
|
2769
|
+
);
|
|
2682
2770
|
}
|
|
2683
2771
|
if (typeof provider !== "string" || provider.trim().length === 0) {
|
|
2684
2772
|
throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
|
|
@@ -2698,14 +2786,15 @@ async function readTargetDefinitions(filePath) {
|
|
|
2698
2786
|
if (!await fileExists3(absolutePath)) {
|
|
2699
2787
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2700
2788
|
}
|
|
2701
|
-
const raw = await
|
|
2789
|
+
const raw = await readFile6(absolutePath, "utf8");
|
|
2702
2790
|
const parsed = parse3(raw);
|
|
2703
2791
|
if (!isRecord(parsed)) {
|
|
2704
|
-
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with
|
|
2792
|
+
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
2705
2793
|
}
|
|
2706
|
-
checkSchema(parsed, absolutePath);
|
|
2707
2794
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
2708
|
-
const definitions = targets.map(
|
|
2795
|
+
const definitions = targets.map(
|
|
2796
|
+
(entry, index) => assertTargetDefinition(entry, index, absolutePath)
|
|
2797
|
+
);
|
|
2709
2798
|
return definitions;
|
|
2710
2799
|
}
|
|
2711
2800
|
function listTargetNames(definitions) {
|
|
@@ -2749,16 +2838,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
|
|
|
2749
2838
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
2750
2839
|
|
|
2751
2840
|
[[ ## expected_outcome ## ]]
|
|
2752
|
-
{{
|
|
2841
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
|
|
2753
2842
|
|
|
2754
2843
|
[[ ## question ## ]]
|
|
2755
|
-
{{
|
|
2844
|
+
{{${TEMPLATE_VARIABLES.QUESTION}}}
|
|
2756
2845
|
|
|
2757
2846
|
[[ ## reference_answer ## ]]
|
|
2758
|
-
{{
|
|
2847
|
+
{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
|
|
2759
2848
|
|
|
2760
2849
|
[[ ## candidate_answer ## ]]
|
|
2761
|
-
{{
|
|
2850
|
+
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
2762
2851
|
var LlmJudgeEvaluator = class {
|
|
2763
2852
|
kind = "llm_judge";
|
|
2764
2853
|
resolveJudgeProvider;
|
|
@@ -2781,12 +2870,16 @@ var LlmJudgeEvaluator = class {
|
|
|
2781
2870
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2782
2871
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2783
2872
|
const variables = {
|
|
2784
|
-
|
|
2785
|
-
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
|
|
2873
|
+
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2874
|
+
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
2875
|
+
context.evalCase.expected_segments,
|
|
2876
|
+
null,
|
|
2877
|
+
2
|
|
2878
|
+
),
|
|
2879
|
+
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
2880
|
+
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
2881
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
2882
|
+
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
|
|
2790
2883
|
};
|
|
2791
2884
|
const systemPrompt = buildOutputSchema();
|
|
2792
2885
|
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
@@ -3018,7 +3111,7 @@ function parseJsonSafe(payload) {
|
|
|
3018
3111
|
}
|
|
3019
3112
|
}
|
|
3020
3113
|
function substituteVariables(template, variables) {
|
|
3021
|
-
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
3114
|
+
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
3022
3115
|
return variables[varName] ?? match;
|
|
3023
3116
|
});
|
|
3024
3117
|
}
|
|
@@ -3028,7 +3121,7 @@ import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
|
3028
3121
|
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3029
3122
|
import path12 from "node:path";
|
|
3030
3123
|
|
|
3031
|
-
// ../../node_modules/.
|
|
3124
|
+
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
3032
3125
|
var Node = class {
|
|
3033
3126
|
value;
|
|
3034
3127
|
next;
|
|
@@ -3061,6 +3154,9 @@ var Queue = class {
|
|
|
3061
3154
|
}
|
|
3062
3155
|
this.#head = this.#head.next;
|
|
3063
3156
|
this.#size--;
|
|
3157
|
+
if (!this.#head) {
|
|
3158
|
+
this.#tail = void 0;
|
|
3159
|
+
}
|
|
3064
3160
|
return current.value;
|
|
3065
3161
|
}
|
|
3066
3162
|
peek() {
|
|
@@ -3091,7 +3187,7 @@ var Queue = class {
|
|
|
3091
3187
|
}
|
|
3092
3188
|
};
|
|
3093
3189
|
|
|
3094
|
-
// ../../node_modules/.
|
|
3190
|
+
// ../../node_modules/.bun/p-limit@6.2.0/node_modules/p-limit/index.js
|
|
3095
3191
|
function pLimit(concurrency) {
|
|
3096
3192
|
validateConcurrency(concurrency);
|
|
3097
3193
|
const queue = new Queue();
|
|
@@ -3182,11 +3278,11 @@ async function runEvaluation(options) {
|
|
|
3182
3278
|
now,
|
|
3183
3279
|
evalId,
|
|
3184
3280
|
verbose,
|
|
3281
|
+
evalCases: preloadedEvalCases,
|
|
3185
3282
|
onResult,
|
|
3186
3283
|
onProgress
|
|
3187
3284
|
} = options;
|
|
3188
|
-
const
|
|
3189
|
-
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3285
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
|
|
3190
3286
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3191
3287
|
if (filteredEvalCases.length === 0) {
|
|
3192
3288
|
if (evalId) {
|
|
@@ -3267,7 +3363,9 @@ async function runEvaluation(options) {
|
|
|
3267
3363
|
} catch (error) {
|
|
3268
3364
|
if (verbose) {
|
|
3269
3365
|
const message = error instanceof Error ? error.message : String(error);
|
|
3270
|
-
console.warn(
|
|
3366
|
+
console.warn(
|
|
3367
|
+
`Provider batch execution failed, falling back to per-case dispatch: ${message}`
|
|
3368
|
+
);
|
|
3271
3369
|
}
|
|
3272
3370
|
}
|
|
3273
3371
|
}
|
|
@@ -3370,8 +3468,9 @@ async function runBatchEvaluation(options) {
|
|
|
3370
3468
|
agentTimeoutMs
|
|
3371
3469
|
} = options;
|
|
3372
3470
|
const promptInputsList = [];
|
|
3471
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
3373
3472
|
for (const evalCase of evalCases) {
|
|
3374
|
-
const promptInputs = await buildPromptInputs(evalCase);
|
|
3473
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
3375
3474
|
if (promptDumpDir) {
|
|
3376
3475
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
3377
3476
|
}
|
|
@@ -3430,7 +3529,14 @@ async function runBatchEvaluation(options) {
|
|
|
3430
3529
|
agentTimeoutMs
|
|
3431
3530
|
});
|
|
3432
3531
|
} catch (error) {
|
|
3433
|
-
const errorResult = buildErrorResult(
|
|
3532
|
+
const errorResult = buildErrorResult(
|
|
3533
|
+
evalCase,
|
|
3534
|
+
target.name,
|
|
3535
|
+
nowFn(),
|
|
3536
|
+
error,
|
|
3537
|
+
promptInputs,
|
|
3538
|
+
provider
|
|
3539
|
+
);
|
|
3434
3540
|
results.push(errorResult);
|
|
3435
3541
|
if (onResult) {
|
|
3436
3542
|
await onResult(errorResult);
|
|
@@ -3477,7 +3583,8 @@ async function runEvalCase(options) {
|
|
|
3477
3583
|
signal,
|
|
3478
3584
|
judgeProvider
|
|
3479
3585
|
} = options;
|
|
3480
|
-
const
|
|
3586
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
3587
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
3481
3588
|
if (promptDumpDir) {
|
|
3482
3589
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
3483
3590
|
}
|
|
@@ -3607,7 +3714,18 @@ async function evaluateCandidate(options) {
|
|
|
3607
3714
|
};
|
|
3608
3715
|
}
|
|
3609
3716
|
async function runEvaluatorsForCase(options) {
|
|
3610
|
-
const {
|
|
3717
|
+
const {
|
|
3718
|
+
evalCase,
|
|
3719
|
+
candidate,
|
|
3720
|
+
target,
|
|
3721
|
+
provider,
|
|
3722
|
+
evaluators,
|
|
3723
|
+
attempt,
|
|
3724
|
+
promptInputs,
|
|
3725
|
+
now,
|
|
3726
|
+
judgeProvider,
|
|
3727
|
+
agentTimeoutMs
|
|
3728
|
+
} = options;
|
|
3611
3729
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
3612
3730
|
return runEvaluatorList({
|
|
3613
3731
|
evalCase,
|
|
@@ -3708,7 +3826,6 @@ async function runEvaluatorList(options) {
|
|
|
3708
3826
|
reasoning: score2.reasoning,
|
|
3709
3827
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3710
3828
|
});
|
|
3711
|
-
continue;
|
|
3712
3829
|
}
|
|
3713
3830
|
} catch (error) {
|
|
3714
3831
|
const message = error instanceof Error ? error.message : String(error);
|
|
@@ -3719,7 +3836,11 @@ async function runEvaluatorList(options) {
|
|
|
3719
3836
|
expectedAspectCount: 1,
|
|
3720
3837
|
reasoning: message
|
|
3721
3838
|
};
|
|
3722
|
-
scored.push({
|
|
3839
|
+
scored.push({
|
|
3840
|
+
score: fallbackScore,
|
|
3841
|
+
name: evaluator.name ?? "unknown",
|
|
3842
|
+
type: evaluator.type ?? "unknown"
|
|
3843
|
+
});
|
|
3723
3844
|
evaluatorResults.push({
|
|
3724
3845
|
name: evaluator.name ?? "unknown",
|
|
3725
3846
|
type: evaluator.type ?? "unknown",
|
|
@@ -3733,7 +3854,10 @@ async function runEvaluatorList(options) {
|
|
|
3733
3854
|
const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
|
|
3734
3855
|
const hits = scored.flatMap((entry) => entry.score.hits);
|
|
3735
3856
|
const misses = scored.flatMap((entry) => entry.score.misses);
|
|
3736
|
-
const expectedAspectCount = scored.reduce(
|
|
3857
|
+
const expectedAspectCount = scored.reduce(
|
|
3858
|
+
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
3859
|
+
0
|
|
3860
|
+
);
|
|
3737
3861
|
const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
|
|
3738
3862
|
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
|
|
3739
3863
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
@@ -3748,7 +3872,18 @@ async function runEvaluatorList(options) {
|
|
|
3748
3872
|
return { score, evaluatorResults };
|
|
3749
3873
|
}
|
|
3750
3874
|
async function runLlmJudgeEvaluator(options) {
|
|
3751
|
-
const {
|
|
3875
|
+
const {
|
|
3876
|
+
config,
|
|
3877
|
+
evalCase,
|
|
3878
|
+
candidate,
|
|
3879
|
+
target,
|
|
3880
|
+
provider,
|
|
3881
|
+
evaluatorRegistry,
|
|
3882
|
+
attempt,
|
|
3883
|
+
promptInputs,
|
|
3884
|
+
now,
|
|
3885
|
+
judgeProvider
|
|
3886
|
+
} = options;
|
|
3752
3887
|
const customPrompt = await resolveCustomPrompt(config);
|
|
3753
3888
|
return evaluatorRegistry.llm_judge.evaluate({
|
|
3754
3889
|
evalCase,
|
|
@@ -3766,7 +3901,8 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3766
3901
|
async function resolveCustomPrompt(config) {
|
|
3767
3902
|
if (config.promptPath) {
|
|
3768
3903
|
try {
|
|
3769
|
-
|
|
3904
|
+
const content = await readTextFile(config.promptPath);
|
|
3905
|
+
return content;
|
|
3770
3906
|
} catch (error) {
|
|
3771
3907
|
const message = error instanceof Error ? error.message : String(error);
|
|
3772
3908
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|