@agentv/core 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +168 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -5
- package/dist/index.d.ts +18 -5
- package/dist/index.js +151 -55
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -101,7 +101,7 @@ interface EvalCase {
|
|
|
101
101
|
readonly question: string;
|
|
102
102
|
readonly input_messages: readonly TestMessage[];
|
|
103
103
|
readonly input_segments: readonly JsonObject[];
|
|
104
|
-
readonly
|
|
104
|
+
readonly expected_segments: readonly JsonObject[];
|
|
105
105
|
readonly reference_answer?: string;
|
|
106
106
|
readonly guideline_paths: readonly string[];
|
|
107
107
|
readonly guideline_patterns?: readonly string[];
|
|
@@ -147,6 +147,17 @@ interface EvaluatorResult {
|
|
|
147
147
|
*/
|
|
148
148
|
declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
149
149
|
|
|
150
|
+
/**
|
|
151
|
+
* Formatting mode for segment content.
|
|
152
|
+
* - 'agent': File references only (for providers with filesystem access)
|
|
153
|
+
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
154
|
+
*/
|
|
155
|
+
type FormattingMode = 'agent' | 'lm';
|
|
156
|
+
/**
|
|
157
|
+
* Extract fenced code blocks from AgentV user segments.
|
|
158
|
+
*/
|
|
159
|
+
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
|
+
|
|
150
161
|
type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
|
|
151
162
|
interface ChatMessage {
|
|
152
163
|
readonly role: ChatMessageRole;
|
|
@@ -271,12 +282,13 @@ interface PromptInputs {
|
|
|
271
282
|
readonly chatPrompt?: ChatPrompt;
|
|
272
283
|
readonly systemMessage?: string;
|
|
273
284
|
}
|
|
274
|
-
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
275
|
-
|
|
276
285
|
/**
|
|
277
|
-
*
|
|
286
|
+
* Build prompt inputs by consolidating user request context and guideline content.
|
|
287
|
+
*
|
|
288
|
+
* @param testCase - The evaluation test case
|
|
289
|
+
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
278
290
|
*/
|
|
279
|
-
declare function
|
|
291
|
+
declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
|
|
280
292
|
|
|
281
293
|
/**
|
|
282
294
|
* Determine whether a path references guideline content (instructions or prompts).
|
|
@@ -605,6 +617,7 @@ interface RunEvaluationOptions {
|
|
|
605
617
|
readonly evalId?: string;
|
|
606
618
|
readonly verbose?: boolean;
|
|
607
619
|
readonly maxConcurrency?: number;
|
|
620
|
+
readonly evalCases?: readonly EvalCase[];
|
|
608
621
|
readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
|
|
609
622
|
readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
|
|
610
623
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -101,7 +101,7 @@ interface EvalCase {
|
|
|
101
101
|
readonly question: string;
|
|
102
102
|
readonly input_messages: readonly TestMessage[];
|
|
103
103
|
readonly input_segments: readonly JsonObject[];
|
|
104
|
-
readonly
|
|
104
|
+
readonly expected_segments: readonly JsonObject[];
|
|
105
105
|
readonly reference_answer?: string;
|
|
106
106
|
readonly guideline_paths: readonly string[];
|
|
107
107
|
readonly guideline_patterns?: readonly string[];
|
|
@@ -147,6 +147,17 @@ interface EvaluatorResult {
|
|
|
147
147
|
*/
|
|
148
148
|
declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
|
|
149
149
|
|
|
150
|
+
/**
|
|
151
|
+
* Formatting mode for segment content.
|
|
152
|
+
* - 'agent': File references only (for providers with filesystem access)
|
|
153
|
+
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
154
|
+
*/
|
|
155
|
+
type FormattingMode = 'agent' | 'lm';
|
|
156
|
+
/**
|
|
157
|
+
* Extract fenced code blocks from AgentV user segments.
|
|
158
|
+
*/
|
|
159
|
+
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
160
|
+
|
|
150
161
|
type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
|
|
151
162
|
interface ChatMessage {
|
|
152
163
|
readonly role: ChatMessageRole;
|
|
@@ -271,12 +282,13 @@ interface PromptInputs {
|
|
|
271
282
|
readonly chatPrompt?: ChatPrompt;
|
|
272
283
|
readonly systemMessage?: string;
|
|
273
284
|
}
|
|
274
|
-
declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
|
|
275
|
-
|
|
276
285
|
/**
|
|
277
|
-
*
|
|
286
|
+
* Build prompt inputs by consolidating user request context and guideline content.
|
|
287
|
+
*
|
|
288
|
+
* @param testCase - The evaluation test case
|
|
289
|
+
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
278
290
|
*/
|
|
279
|
-
declare function
|
|
291
|
+
declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
|
|
280
292
|
|
|
281
293
|
/**
|
|
282
294
|
* Determine whether a path references guideline content (instructions or prompts).
|
|
@@ -605,6 +617,7 @@ interface RunEvaluationOptions {
|
|
|
605
617
|
readonly evalId?: string;
|
|
606
618
|
readonly verbose?: boolean;
|
|
607
619
|
readonly maxConcurrency?: number;
|
|
620
|
+
readonly evalCases?: readonly EvalCase[];
|
|
608
621
|
readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
|
|
609
622
|
readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
|
|
610
623
|
}
|
package/dist/index.js
CHANGED
|
@@ -62,7 +62,7 @@ function getHitCount(result) {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
// src/evaluation/yaml-parser.ts
|
|
65
|
-
import { readFile as
|
|
65
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
66
66
|
import path6 from "node:path";
|
|
67
67
|
import { parse as parse2 } from "yaml";
|
|
68
68
|
|
|
@@ -100,7 +100,7 @@ ${part.content}
|
|
|
100
100
|
}
|
|
101
101
|
return parts.map((p) => p.content).join(" ");
|
|
102
102
|
}
|
|
103
|
-
function formatSegment(segment) {
|
|
103
|
+
function formatSegment(segment, mode = "lm") {
|
|
104
104
|
const type = asString(segment.type);
|
|
105
105
|
if (type === "text") {
|
|
106
106
|
return asString(segment.value);
|
|
@@ -110,8 +110,14 @@ function formatSegment(segment) {
|
|
|
110
110
|
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
111
111
|
}
|
|
112
112
|
if (type === "file") {
|
|
113
|
-
const text = asString(segment.text);
|
|
114
113
|
const filePath = asString(segment.path);
|
|
114
|
+
if (!filePath) {
|
|
115
|
+
return void 0;
|
|
116
|
+
}
|
|
117
|
+
if (mode === "agent") {
|
|
118
|
+
return `<file: path="${filePath}">`;
|
|
119
|
+
}
|
|
120
|
+
const text = asString(segment.text);
|
|
115
121
|
if (text && filePath) {
|
|
116
122
|
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
117
123
|
}
|
|
@@ -315,8 +321,67 @@ function logWarning(message) {
|
|
|
315
321
|
|
|
316
322
|
// src/evaluation/loaders/evaluator-parser.ts
|
|
317
323
|
import path3 from "node:path";
|
|
324
|
+
|
|
325
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
326
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
327
|
+
|
|
328
|
+
// src/evaluation/template-variables.ts
|
|
329
|
+
var TEMPLATE_VARIABLES = {
|
|
330
|
+
CANDIDATE_ANSWER: "candidate_answer",
|
|
331
|
+
EXPECTED_MESSAGES: "expected_messages",
|
|
332
|
+
QUESTION: "question",
|
|
333
|
+
EXPECTED_OUTCOME: "expected_outcome",
|
|
334
|
+
REFERENCE_ANSWER: "reference_answer",
|
|
335
|
+
INPUT_MESSAGES: "input_messages"
|
|
336
|
+
};
|
|
337
|
+
var VALID_TEMPLATE_VARIABLES = new Set(
|
|
338
|
+
Object.values(TEMPLATE_VARIABLES)
|
|
339
|
+
);
|
|
340
|
+
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
341
|
+
TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
|
|
342
|
+
TEMPLATE_VARIABLES.EXPECTED_MESSAGES
|
|
343
|
+
]);
|
|
344
|
+
|
|
345
|
+
// src/evaluation/validation/prompt-validator.ts
|
|
318
346
|
var ANSI_YELLOW2 = "\x1B[33m";
|
|
319
347
|
var ANSI_RESET2 = "\x1B[0m";
|
|
348
|
+
async function validateCustomPromptContent(promptPath) {
|
|
349
|
+
const content = await readFile2(promptPath, "utf8");
|
|
350
|
+
validateTemplateVariables(content, promptPath);
|
|
351
|
+
}
|
|
352
|
+
function validateTemplateVariables(content, source) {
|
|
353
|
+
const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
|
|
354
|
+
const foundVariables = /* @__PURE__ */ new Set();
|
|
355
|
+
const invalidVariables = [];
|
|
356
|
+
let match;
|
|
357
|
+
while ((match = variablePattern.exec(content)) !== null) {
|
|
358
|
+
const varName = match[1];
|
|
359
|
+
foundVariables.add(varName);
|
|
360
|
+
if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
|
|
361
|
+
invalidVariables.push(varName);
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
|
|
365
|
+
const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
|
|
366
|
+
const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
|
|
367
|
+
if (!hasRequiredFields) {
|
|
368
|
+
throw new Error(
|
|
369
|
+
`Missing required fields. Must include at least one of:
|
|
370
|
+
- {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
|
|
371
|
+
- {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
|
|
372
|
+
);
|
|
373
|
+
}
|
|
374
|
+
if (invalidVariables.length > 0) {
|
|
375
|
+
const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
|
|
376
|
+
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
377
|
+
Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
|
|
378
|
+
console.warn(warningMessage);
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
383
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
384
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
320
385
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
321
386
|
const execution = rawEvalCase.execution;
|
|
322
387
|
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
@@ -375,6 +440,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
375
440
|
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
376
441
|
if (resolved.resolvedPath) {
|
|
377
442
|
promptPath = path3.resolve(resolved.resolvedPath);
|
|
443
|
+
try {
|
|
444
|
+
await validateCustomPromptContent(promptPath);
|
|
445
|
+
} catch (error) {
|
|
446
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
447
|
+
throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
|
|
448
|
+
}
|
|
378
449
|
} else {
|
|
379
450
|
logWarning2(
|
|
380
451
|
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
@@ -411,18 +482,18 @@ function isJsonObject2(value) {
|
|
|
411
482
|
function logWarning2(message, details) {
|
|
412
483
|
if (details && details.length > 0) {
|
|
413
484
|
const detailBlock = details.join("\n");
|
|
414
|
-
console.warn(`${
|
|
415
|
-
${detailBlock}${
|
|
485
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
486
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
416
487
|
} else {
|
|
417
|
-
console.warn(`${
|
|
488
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
418
489
|
}
|
|
419
490
|
}
|
|
420
491
|
|
|
421
492
|
// src/evaluation/loaders/message-processor.ts
|
|
422
|
-
import { readFile as
|
|
493
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
423
494
|
import path4 from "node:path";
|
|
424
|
-
var
|
|
425
|
-
var
|
|
495
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
496
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
426
497
|
async function processMessages(options) {
|
|
427
498
|
const {
|
|
428
499
|
messages,
|
|
@@ -465,7 +536,7 @@ async function processMessages(options) {
|
|
|
465
536
|
continue;
|
|
466
537
|
}
|
|
467
538
|
try {
|
|
468
|
-
const fileContent = (await
|
|
539
|
+
const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
469
540
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
470
541
|
const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
|
|
471
542
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
@@ -536,7 +607,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
536
607
|
continue;
|
|
537
608
|
}
|
|
538
609
|
try {
|
|
539
|
-
const fileContent = (await
|
|
610
|
+
const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
540
611
|
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
541
612
|
if (verbose) {
|
|
542
613
|
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
@@ -586,19 +657,19 @@ function cloneJsonValue(value) {
|
|
|
586
657
|
function logWarning3(message, details) {
|
|
587
658
|
if (details && details.length > 0) {
|
|
588
659
|
const detailBlock = details.join("\n");
|
|
589
|
-
console.warn(`${
|
|
590
|
-
${detailBlock}${
|
|
660
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}
|
|
661
|
+
${detailBlock}${ANSI_RESET4}`);
|
|
591
662
|
} else {
|
|
592
|
-
console.warn(`${
|
|
663
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
593
664
|
}
|
|
594
665
|
}
|
|
595
666
|
|
|
596
667
|
// src/evaluation/formatting/prompt-builder.ts
|
|
597
|
-
import { readFile as
|
|
668
|
+
import { readFile as readFile4 } from "node:fs/promises";
|
|
598
669
|
import path5 from "node:path";
|
|
599
|
-
var
|
|
600
|
-
var
|
|
601
|
-
async function buildPromptInputs(testCase) {
|
|
670
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
671
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
672
|
+
async function buildPromptInputs(testCase, mode = "lm") {
|
|
602
673
|
const guidelineParts = [];
|
|
603
674
|
for (const rawPath of testCase.guideline_paths) {
|
|
604
675
|
const absolutePath = path5.resolve(rawPath);
|
|
@@ -607,7 +678,7 @@ async function buildPromptInputs(testCase) {
|
|
|
607
678
|
continue;
|
|
608
679
|
}
|
|
609
680
|
try {
|
|
610
|
-
const content = (await
|
|
681
|
+
const content = (await readFile4(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
611
682
|
guidelineParts.push({
|
|
612
683
|
content,
|
|
613
684
|
isFile: true,
|
|
@@ -674,7 +745,7 @@ async function buildPromptInputs(testCase) {
|
|
|
674
745
|
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
675
746
|
const contentParts = [];
|
|
676
747
|
for (const segment of segments) {
|
|
677
|
-
const formattedContent = formatSegment(segment);
|
|
748
|
+
const formattedContent = formatSegment(segment, mode);
|
|
678
749
|
if (formattedContent) {
|
|
679
750
|
contentParts.push(formattedContent);
|
|
680
751
|
}
|
|
@@ -689,7 +760,11 @@ ${messageContent}`);
|
|
|
689
760
|
} else {
|
|
690
761
|
const questionParts = [];
|
|
691
762
|
for (const segment of testCase.input_segments) {
|
|
692
|
-
|
|
763
|
+
if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
|
|
764
|
+
questionParts.push(`<Attached: ${segment.path}>`);
|
|
765
|
+
continue;
|
|
766
|
+
}
|
|
767
|
+
const formattedContent = formatSegment(segment, mode);
|
|
693
768
|
if (formattedContent) {
|
|
694
769
|
questionParts.push(formattedContent);
|
|
695
770
|
}
|
|
@@ -703,7 +778,8 @@ ${messageContent}`);
|
|
|
703
778
|
messages: testCase.input_messages,
|
|
704
779
|
segmentsByMessage,
|
|
705
780
|
guidelinePatterns: testCase.guideline_patterns,
|
|
706
|
-
guidelineContent: guidelines
|
|
781
|
+
guidelineContent: guidelines,
|
|
782
|
+
mode
|
|
707
783
|
}) : void 0;
|
|
708
784
|
return { question, guidelines, chatPrompt };
|
|
709
785
|
}
|
|
@@ -720,7 +796,7 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
|
720
796
|
return messagesWithContent > 1;
|
|
721
797
|
}
|
|
722
798
|
function buildChatPromptFromSegments(options) {
|
|
723
|
-
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
799
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt, mode = "lm" } = options;
|
|
724
800
|
if (messages.length === 0) {
|
|
725
801
|
return void 0;
|
|
726
802
|
}
|
|
@@ -738,7 +814,7 @@ ${guidelineContent.trim()}`);
|
|
|
738
814
|
const segments = segmentsByMessage[startIndex];
|
|
739
815
|
const contentParts = [];
|
|
740
816
|
for (const segment of segments) {
|
|
741
|
-
const formatted = formatSegment(segment);
|
|
817
|
+
const formatted = formatSegment(segment, mode);
|
|
742
818
|
if (formatted) {
|
|
743
819
|
contentParts.push(formatted);
|
|
744
820
|
}
|
|
@@ -771,7 +847,7 @@ ${guidelineContent.trim()}`);
|
|
|
771
847
|
if (segment.type === "guideline_ref") {
|
|
772
848
|
continue;
|
|
773
849
|
}
|
|
774
|
-
const formatted = formatSegment(segment);
|
|
850
|
+
const formatted = formatSegment(segment, mode);
|
|
775
851
|
if (formatted) {
|
|
776
852
|
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
777
853
|
if (isGuidelineRef) {
|
|
@@ -795,17 +871,18 @@ function asString4(value) {
|
|
|
795
871
|
return typeof value === "string" ? value : void 0;
|
|
796
872
|
}
|
|
797
873
|
function logWarning4(message) {
|
|
798
|
-
console.warn(`${
|
|
874
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
799
875
|
}
|
|
800
876
|
|
|
801
877
|
// src/evaluation/yaml-parser.ts
|
|
802
|
-
var
|
|
803
|
-
var
|
|
878
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
879
|
+
var ANSI_RED = "\x1B[31m";
|
|
880
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
804
881
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
805
882
|
async function readTestSuiteMetadata(testFilePath) {
|
|
806
883
|
try {
|
|
807
884
|
const absolutePath = path6.resolve(testFilePath);
|
|
808
|
-
const content = await
|
|
885
|
+
const content = await readFile5(absolutePath, "utf8");
|
|
809
886
|
const parsed = parse2(content);
|
|
810
887
|
if (!isJsonObject(parsed)) {
|
|
811
888
|
return {};
|
|
@@ -823,7 +900,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
823
900
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
824
901
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
825
902
|
const guidelinePatterns = config?.guideline_patterns;
|
|
826
|
-
const rawFile = await
|
|
903
|
+
const rawFile = await readFile5(absoluteTestPath, "utf8");
|
|
827
904
|
const parsed = parse2(rawFile);
|
|
828
905
|
if (!isJsonObject(parsed)) {
|
|
829
906
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
@@ -861,14 +938,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
861
938
|
const inputMessagesValue = evalcase.input_messages;
|
|
862
939
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
863
940
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
864
|
-
|
|
941
|
+
logError(`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`);
|
|
865
942
|
continue;
|
|
866
943
|
}
|
|
867
944
|
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
868
945
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
869
946
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
870
947
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
871
|
-
|
|
948
|
+
logError(`No valid expected message found for eval case: ${id}`);
|
|
872
949
|
continue;
|
|
873
950
|
}
|
|
874
951
|
if (expectedMessages.length > 1) {
|
|
@@ -899,7 +976,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
899
976
|
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
900
977
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
901
978
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
902
|
-
|
|
979
|
+
let evaluators;
|
|
980
|
+
try {
|
|
981
|
+
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
982
|
+
} catch (error) {
|
|
983
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
984
|
+
logError(`Skipping eval case '${id}': ${message}`);
|
|
985
|
+
continue;
|
|
986
|
+
}
|
|
903
987
|
const userFilePaths = [];
|
|
904
988
|
for (const segment of inputSegments) {
|
|
905
989
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -917,7 +1001,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
917
1001
|
question,
|
|
918
1002
|
input_messages: inputMessages,
|
|
919
1003
|
input_segments: inputSegments,
|
|
920
|
-
|
|
1004
|
+
expected_segments: outputSegments,
|
|
921
1005
|
reference_answer: referenceAnswer,
|
|
922
1006
|
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
923
1007
|
guideline_patterns: guidelinePatterns,
|
|
@@ -949,10 +1033,19 @@ function asString5(value) {
|
|
|
949
1033
|
function logWarning5(message, details) {
|
|
950
1034
|
if (details && details.length > 0) {
|
|
951
1035
|
const detailBlock = details.join("\n");
|
|
952
|
-
console.warn(`${
|
|
953
|
-
${detailBlock}${
|
|
1036
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}
|
|
1037
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
954
1038
|
} else {
|
|
955
|
-
console.warn(`${
|
|
1039
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1040
|
+
}
|
|
1041
|
+
}
|
|
1042
|
+
function logError(message, details) {
|
|
1043
|
+
if (details && details.length > 0) {
|
|
1044
|
+
const detailBlock = details.join("\n");
|
|
1045
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1046
|
+
${detailBlock}${ANSI_RESET6}`);
|
|
1047
|
+
} else {
|
|
1048
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
|
|
956
1049
|
}
|
|
957
1050
|
}
|
|
958
1051
|
|
|
@@ -2637,7 +2730,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2637
2730
|
|
|
2638
2731
|
// src/evaluation/providers/targets-file.ts
|
|
2639
2732
|
import { constants as constants3 } from "node:fs";
|
|
2640
|
-
import { access as access3, readFile as
|
|
2733
|
+
import { access as access3, readFile as readFile6 } from "node:fs/promises";
|
|
2641
2734
|
import path11 from "node:path";
|
|
2642
2735
|
import { parse as parse3 } from "yaml";
|
|
2643
2736
|
function isRecord(value) {
|
|
@@ -2698,7 +2791,7 @@ async function readTargetDefinitions(filePath) {
|
|
|
2698
2791
|
if (!await fileExists3(absolutePath)) {
|
|
2699
2792
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2700
2793
|
}
|
|
2701
|
-
const raw = await
|
|
2794
|
+
const raw = await readFile6(absolutePath, "utf8");
|
|
2702
2795
|
const parsed = parse3(raw);
|
|
2703
2796
|
if (!isRecord(parsed)) {
|
|
2704
2797
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
@@ -2749,16 +2842,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
|
|
|
2749
2842
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
2750
2843
|
|
|
2751
2844
|
[[ ## expected_outcome ## ]]
|
|
2752
|
-
{{
|
|
2845
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
|
|
2753
2846
|
|
|
2754
2847
|
[[ ## question ## ]]
|
|
2755
|
-
{{
|
|
2848
|
+
{{${TEMPLATE_VARIABLES.QUESTION}}}
|
|
2756
2849
|
|
|
2757
2850
|
[[ ## reference_answer ## ]]
|
|
2758
|
-
{{
|
|
2851
|
+
{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
|
|
2759
2852
|
|
|
2760
2853
|
[[ ## candidate_answer ## ]]
|
|
2761
|
-
{{
|
|
2854
|
+
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
2762
2855
|
var LlmJudgeEvaluator = class {
|
|
2763
2856
|
kind = "llm_judge";
|
|
2764
2857
|
resolveJudgeProvider;
|
|
@@ -2781,12 +2874,12 @@ var LlmJudgeEvaluator = class {
|
|
|
2781
2874
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2782
2875
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2783
2876
|
const variables = {
|
|
2784
|
-
|
|
2785
|
-
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
|
|
2877
|
+
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2878
|
+
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(context.evalCase.expected_segments, null, 2),
|
|
2879
|
+
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
2880
|
+
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
2881
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
2882
|
+
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
|
|
2790
2883
|
};
|
|
2791
2884
|
const systemPrompt = buildOutputSchema();
|
|
2792
2885
|
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
@@ -3018,7 +3111,7 @@ function parseJsonSafe(payload) {
|
|
|
3018
3111
|
}
|
|
3019
3112
|
}
|
|
3020
3113
|
function substituteVariables(template, variables) {
|
|
3021
|
-
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
3114
|
+
return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
|
|
3022
3115
|
return variables[varName] ?? match;
|
|
3023
3116
|
});
|
|
3024
3117
|
}
|
|
@@ -3182,11 +3275,11 @@ async function runEvaluation(options) {
|
|
|
3182
3275
|
now,
|
|
3183
3276
|
evalId,
|
|
3184
3277
|
verbose,
|
|
3278
|
+
evalCases: preloadedEvalCases,
|
|
3185
3279
|
onResult,
|
|
3186
3280
|
onProgress
|
|
3187
3281
|
} = options;
|
|
3188
|
-
const
|
|
3189
|
-
const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
|
|
3282
|
+
const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
|
|
3190
3283
|
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
3191
3284
|
if (filteredEvalCases.length === 0) {
|
|
3192
3285
|
if (evalId) {
|
|
@@ -3370,8 +3463,9 @@ async function runBatchEvaluation(options) {
|
|
|
3370
3463
|
agentTimeoutMs
|
|
3371
3464
|
} = options;
|
|
3372
3465
|
const promptInputsList = [];
|
|
3466
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
3373
3467
|
for (const evalCase of evalCases) {
|
|
3374
|
-
const promptInputs = await buildPromptInputs(evalCase);
|
|
3468
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
3375
3469
|
if (promptDumpDir) {
|
|
3376
3470
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
3377
3471
|
}
|
|
@@ -3477,7 +3571,8 @@ async function runEvalCase(options) {
|
|
|
3477
3571
|
signal,
|
|
3478
3572
|
judgeProvider
|
|
3479
3573
|
} = options;
|
|
3480
|
-
const
|
|
3574
|
+
const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
|
|
3575
|
+
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
3481
3576
|
if (promptDumpDir) {
|
|
3482
3577
|
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
3483
3578
|
}
|
|
@@ -3766,7 +3861,8 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3766
3861
|
async function resolveCustomPrompt(config) {
|
|
3767
3862
|
if (config.promptPath) {
|
|
3768
3863
|
try {
|
|
3769
|
-
|
|
3864
|
+
const content = await readTextFile(config.promptPath);
|
|
3865
|
+
return content;
|
|
3770
3866
|
} catch (error) {
|
|
3771
3867
|
const message = error instanceof Error ? error.message : String(error);
|
|
3772
3868
|
console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
|