@agentv/core 0.9.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-SNTZFB24.js → chunk-YQBJAT5I.js} +1 -1
- package/dist/{chunk-SNTZFB24.js.map → chunk-YQBJAT5I.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs +30 -13
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +21 -4
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +375 -104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +74 -64
- package/dist/index.d.ts +74 -64
- package/dist/index.js +375 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-SNTZFB24.js";
|
|
12
|
+
} from "./chunk-YQBJAT5I.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -73,6 +73,33 @@ var ANSI_YELLOW = "\x1B[33m";
|
|
|
73
73
|
var ANSI_RESET = "\x1B[0m";
|
|
74
74
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
75
75
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
76
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
77
|
+
try {
|
|
78
|
+
const absolutePath = path.resolve(testFilePath);
|
|
79
|
+
const content = await readFile(absolutePath, "utf8");
|
|
80
|
+
const parsed = parse(content);
|
|
81
|
+
if (!isJsonObject(parsed)) {
|
|
82
|
+
return {};
|
|
83
|
+
}
|
|
84
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
85
|
+
} catch {
|
|
86
|
+
return {};
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
function extractTargetFromSuite(suite) {
|
|
90
|
+
const execution = suite.execution;
|
|
91
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
92
|
+
const executionTarget = execution.target;
|
|
93
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
94
|
+
return executionTarget.trim();
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
const targetValue = suite.target;
|
|
98
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
99
|
+
return targetValue.trim();
|
|
100
|
+
}
|
|
101
|
+
return void 0;
|
|
102
|
+
}
|
|
76
103
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
77
104
|
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
78
105
|
for (const directory of directories) {
|
|
@@ -249,6 +276,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
249
276
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
250
277
|
}
|
|
251
278
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
279
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
280
|
+
const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
|
|
252
281
|
const results = [];
|
|
253
282
|
for (const rawEvalcase of rawTestcases) {
|
|
254
283
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -268,14 +297,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
268
297
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
269
298
|
continue;
|
|
270
299
|
}
|
|
271
|
-
|
|
272
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
273
|
-
continue;
|
|
274
|
-
}
|
|
300
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
275
301
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
276
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
277
|
-
if (expectedMessages.length === 0) {
|
|
278
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
302
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
303
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
304
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
279
305
|
continue;
|
|
280
306
|
}
|
|
281
307
|
if (expectedMessages.length > 1) {
|
|
@@ -293,20 +319,20 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
293
319
|
messageType: "input",
|
|
294
320
|
verbose
|
|
295
321
|
});
|
|
296
|
-
const outputSegments = await processMessages({
|
|
322
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
297
323
|
messages: expectedMessages,
|
|
298
324
|
searchRoots,
|
|
299
325
|
repoRootPath,
|
|
300
326
|
guidelinePatterns,
|
|
301
327
|
messageType: "output",
|
|
302
328
|
verbose
|
|
303
|
-
});
|
|
329
|
+
}) : [];
|
|
304
330
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
305
331
|
const expectedContent = expectedMessages[0]?.content;
|
|
306
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
332
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
307
333
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
308
334
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
309
|
-
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
335
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
310
336
|
const userFilePaths = [];
|
|
311
337
|
for (const segment of inputSegments) {
|
|
312
338
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -322,6 +348,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
322
348
|
dataset: datasetName,
|
|
323
349
|
conversation_id: conversationId,
|
|
324
350
|
question,
|
|
351
|
+
input_messages: inputMessages,
|
|
325
352
|
input_segments: inputSegments,
|
|
326
353
|
output_segments: outputSegments,
|
|
327
354
|
reference_answer: referenceAnswer,
|
|
@@ -349,6 +376,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
349
376
|
}
|
|
350
377
|
return results;
|
|
351
378
|
}
|
|
379
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
380
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
381
|
+
return true;
|
|
382
|
+
}
|
|
383
|
+
let messagesWithContent = 0;
|
|
384
|
+
for (const segments of processedSegmentsByMessage) {
|
|
385
|
+
if (hasVisibleContent(segments)) {
|
|
386
|
+
messagesWithContent++;
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
return messagesWithContent > 1;
|
|
390
|
+
}
|
|
391
|
+
function hasVisibleContent(segments) {
|
|
392
|
+
return segments.some((segment) => {
|
|
393
|
+
const type = asString(segment.type);
|
|
394
|
+
if (type === "text") {
|
|
395
|
+
const value = asString(segment.value);
|
|
396
|
+
return value !== void 0 && value.trim().length > 0;
|
|
397
|
+
}
|
|
398
|
+
if (type === "guideline_ref") {
|
|
399
|
+
return false;
|
|
400
|
+
}
|
|
401
|
+
if (type === "file") {
|
|
402
|
+
const text = asString(segment.text);
|
|
403
|
+
return text !== void 0 && text.trim().length > 0;
|
|
404
|
+
}
|
|
405
|
+
return false;
|
|
406
|
+
});
|
|
407
|
+
}
|
|
408
|
+
function formatSegment(segment) {
|
|
409
|
+
const type = asString(segment.type);
|
|
410
|
+
if (type === "text") {
|
|
411
|
+
return asString(segment.value);
|
|
412
|
+
}
|
|
413
|
+
if (type === "guideline_ref") {
|
|
414
|
+
const refPath = asString(segment.path);
|
|
415
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
416
|
+
}
|
|
417
|
+
if (type === "file") {
|
|
418
|
+
const text = asString(segment.text);
|
|
419
|
+
const filePath = asString(segment.path);
|
|
420
|
+
if (text && filePath) {
|
|
421
|
+
return `=== ${filePath} ===
|
|
422
|
+
${text}`;
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
return void 0;
|
|
426
|
+
}
|
|
352
427
|
async function buildPromptInputs(testCase) {
|
|
353
428
|
const guidelineContents = [];
|
|
354
429
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -365,36 +440,168 @@ ${content}`);
|
|
|
365
440
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
366
441
|
}
|
|
367
442
|
}
|
|
368
|
-
const
|
|
443
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
444
|
+
const segmentsByMessage = [];
|
|
445
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
369
446
|
for (const segment of testCase.input_segments) {
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
const pathValue = segment.path;
|
|
373
|
-
const textValue = segment.text;
|
|
374
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
375
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
376
|
-
questionParts.push(`=== ${label} ===
|
|
377
|
-
${body}`);
|
|
378
|
-
continue;
|
|
447
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
448
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
379
449
|
}
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
450
|
+
}
|
|
451
|
+
for (const message of testCase.input_messages) {
|
|
452
|
+
const messageSegments = [];
|
|
453
|
+
if (typeof message.content === "string") {
|
|
454
|
+
if (message.content.trim().length > 0) {
|
|
455
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
456
|
+
}
|
|
457
|
+
} else if (Array.isArray(message.content)) {
|
|
458
|
+
for (const segment of message.content) {
|
|
459
|
+
if (typeof segment === "string") {
|
|
460
|
+
if (segment.trim().length > 0) {
|
|
461
|
+
messageSegments.push({ type: "text", value: segment });
|
|
462
|
+
}
|
|
463
|
+
} else if (isJsonObject(segment)) {
|
|
464
|
+
const type = asString(segment.type);
|
|
465
|
+
if (type === "file") {
|
|
466
|
+
const value = asString(segment.value);
|
|
467
|
+
if (!value) continue;
|
|
468
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
469
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
470
|
+
continue;
|
|
471
|
+
}
|
|
472
|
+
const fileText = fileContentsByPath.get(value);
|
|
473
|
+
if (fileText !== void 0) {
|
|
474
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
475
|
+
}
|
|
476
|
+
} else if (type === "text") {
|
|
477
|
+
const textValue = asString(segment.value);
|
|
478
|
+
if (textValue && textValue.trim().length > 0) {
|
|
479
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
}
|
|
384
483
|
}
|
|
385
|
-
continue;
|
|
386
484
|
}
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
485
|
+
segmentsByMessage.push(messageSegments);
|
|
486
|
+
}
|
|
487
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
488
|
+
let question;
|
|
489
|
+
if (useRoleMarkers) {
|
|
490
|
+
const messageParts = [];
|
|
491
|
+
for (let i = 0; i < testCase.input_messages.length; i++) {
|
|
492
|
+
const message = testCase.input_messages[i];
|
|
493
|
+
const segments = segmentsByMessage[i];
|
|
494
|
+
if (!hasVisibleContent(segments)) {
|
|
495
|
+
continue;
|
|
496
|
+
}
|
|
497
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
498
|
+
const contentParts = [];
|
|
499
|
+
for (const segment of segments) {
|
|
500
|
+
const formattedContent = formatSegment(segment);
|
|
501
|
+
if (formattedContent) {
|
|
502
|
+
contentParts.push(formattedContent);
|
|
503
|
+
}
|
|
504
|
+
}
|
|
505
|
+
if (contentParts.length > 0) {
|
|
506
|
+
const messageContent = contentParts.join("\n");
|
|
507
|
+
messageParts.push(`@[${roleLabel}]:
|
|
508
|
+
${messageContent}`);
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
question = messageParts.join("\n\n");
|
|
512
|
+
} else {
|
|
513
|
+
const questionParts = [];
|
|
514
|
+
for (const segment of testCase.input_segments) {
|
|
515
|
+
const formattedContent = formatSegment(segment);
|
|
516
|
+
if (formattedContent) {
|
|
517
|
+
questionParts.push(formattedContent);
|
|
518
|
+
}
|
|
390
519
|
}
|
|
520
|
+
if (testCase.code_snippets.length > 0) {
|
|
521
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
522
|
+
}
|
|
523
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
391
524
|
}
|
|
392
|
-
|
|
393
|
-
|
|
525
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
526
|
+
messages: testCase.input_messages,
|
|
527
|
+
segmentsByMessage,
|
|
528
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
529
|
+
guidelineContent: guidelines
|
|
530
|
+
}) : void 0;
|
|
531
|
+
return { question, guidelines, chatPrompt };
|
|
532
|
+
}
|
|
533
|
+
function buildChatPromptFromSegments(options) {
|
|
534
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
535
|
+
if (messages.length === 0) {
|
|
536
|
+
return void 0;
|
|
394
537
|
}
|
|
395
|
-
const
|
|
396
|
-
|
|
397
|
-
|
|
538
|
+
const systemSegments = [];
|
|
539
|
+
if (systemPrompt && systemPrompt.trim().length > 0) {
|
|
540
|
+
systemSegments.push(systemPrompt.trim());
|
|
541
|
+
}
|
|
542
|
+
if (guidelineContent && guidelineContent.trim().length > 0) {
|
|
543
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
544
|
+
|
|
545
|
+
${guidelineContent.trim()}`);
|
|
546
|
+
}
|
|
547
|
+
let startIndex = 0;
|
|
548
|
+
while (startIndex < messages.length && messages[startIndex].role === "system") {
|
|
549
|
+
const segments = segmentsByMessage[startIndex];
|
|
550
|
+
const contentParts = [];
|
|
551
|
+
for (const segment of segments) {
|
|
552
|
+
const formatted = formatSegment(segment);
|
|
553
|
+
if (formatted) {
|
|
554
|
+
contentParts.push(formatted);
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
if (contentParts.length > 0) {
|
|
558
|
+
systemSegments.push(contentParts.join("\n"));
|
|
559
|
+
}
|
|
560
|
+
startIndex += 1;
|
|
561
|
+
}
|
|
562
|
+
const chatPrompt = [];
|
|
563
|
+
if (systemSegments.length > 0) {
|
|
564
|
+
chatPrompt.push({
|
|
565
|
+
role: "system",
|
|
566
|
+
content: systemSegments.join("\n\n")
|
|
567
|
+
});
|
|
568
|
+
}
|
|
569
|
+
for (let i = startIndex; i < messages.length; i++) {
|
|
570
|
+
const message = messages[i];
|
|
571
|
+
const segments = segmentsByMessage[i];
|
|
572
|
+
const contentParts = [];
|
|
573
|
+
let role = message.role;
|
|
574
|
+
let name;
|
|
575
|
+
if (role === "system") {
|
|
576
|
+
role = "assistant";
|
|
577
|
+
contentParts.push("@[System]:");
|
|
578
|
+
} else if (role === "tool") {
|
|
579
|
+
role = "function";
|
|
580
|
+
name = "tool";
|
|
581
|
+
}
|
|
582
|
+
for (const segment of segments) {
|
|
583
|
+
if (segment.type === "guideline_ref") {
|
|
584
|
+
continue;
|
|
585
|
+
}
|
|
586
|
+
const formatted = formatSegment(segment);
|
|
587
|
+
if (formatted) {
|
|
588
|
+
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
589
|
+
if (isGuidelineRef) {
|
|
590
|
+
continue;
|
|
591
|
+
}
|
|
592
|
+
contentParts.push(formatted);
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
if (contentParts.length === 0) {
|
|
596
|
+
continue;
|
|
597
|
+
}
|
|
598
|
+
chatPrompt.push({
|
|
599
|
+
role,
|
|
600
|
+
content: contentParts.join("\n"),
|
|
601
|
+
...name ? { name } : {}
|
|
602
|
+
});
|
|
603
|
+
}
|
|
604
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
398
605
|
}
|
|
399
606
|
async function fileExists2(absolutePath) {
|
|
400
607
|
try {
|
|
@@ -492,9 +699,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
492
699
|
}
|
|
493
700
|
return parts.join(" ");
|
|
494
701
|
}
|
|
495
|
-
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
702
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
496
703
|
const execution = rawEvalCase.execution;
|
|
497
|
-
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
|
|
704
|
+
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
498
705
|
if (candidateEvaluators === void 0) {
|
|
499
706
|
return void 0;
|
|
500
707
|
}
|
|
@@ -532,6 +739,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
532
739
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
533
740
|
);
|
|
534
741
|
}
|
|
742
|
+
} else {
|
|
743
|
+
resolvedCwd = searchRoots[0];
|
|
535
744
|
}
|
|
536
745
|
evaluators.push({
|
|
537
746
|
name,
|
|
@@ -560,8 +769,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
560
769
|
name,
|
|
561
770
|
type: "llm_judge",
|
|
562
771
|
prompt,
|
|
563
|
-
promptPath
|
|
564
|
-
model
|
|
772
|
+
promptPath
|
|
565
773
|
});
|
|
566
774
|
}
|
|
567
775
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -591,21 +799,14 @@ import { AxAI } from "@ax-llm/ax";
|
|
|
591
799
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
592
800
|
function buildChatPrompt(request) {
|
|
593
801
|
if (request.chatPrompt) {
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
} else {
|
|
601
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
602
|
-
}
|
|
603
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
604
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
605
|
-
|
|
606
|
-
${request.guidelines.trim()}`);
|
|
802
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
803
|
+
if (hasSystemMessage) {
|
|
804
|
+
return request.chatPrompt;
|
|
805
|
+
}
|
|
806
|
+
const systemContent2 = resolveSystemContent(request);
|
|
807
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
607
808
|
}
|
|
608
|
-
const systemContent =
|
|
809
|
+
const systemContent = resolveSystemContent(request);
|
|
609
810
|
const userContent = request.question.trim();
|
|
610
811
|
const prompt = [
|
|
611
812
|
{
|
|
@@ -619,6 +820,21 @@ ${request.guidelines.trim()}`);
|
|
|
619
820
|
];
|
|
620
821
|
return prompt;
|
|
621
822
|
}
|
|
823
|
+
function resolveSystemContent(request) {
|
|
824
|
+
const systemSegments = [];
|
|
825
|
+
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
826
|
+
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
827
|
+
systemSegments.push(metadataSystemPrompt.trim());
|
|
828
|
+
} else {
|
|
829
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
830
|
+
}
|
|
831
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
832
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
833
|
+
|
|
834
|
+
${request.guidelines.trim()}`);
|
|
835
|
+
}
|
|
836
|
+
return systemSegments.join("\n\n");
|
|
837
|
+
}
|
|
622
838
|
function extractModelConfig(request, defaults) {
|
|
623
839
|
const temperature = request.temperature ?? defaults.temperature;
|
|
624
840
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -2330,24 +2546,23 @@ var LlmJudgeEvaluator = class {
|
|
|
2330
2546
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2331
2547
|
}
|
|
2332
2548
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2333
|
-
|
|
2334
|
-
|
|
2549
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
2550
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2551
|
+
let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
|
|
2552
|
+
let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
2335
2553
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
2336
2554
|
const variables = {
|
|
2337
2555
|
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2338
2556
|
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2339
2557
|
candidate_answer: context.candidate,
|
|
2340
|
-
reference_answer: context.evalCase.reference_answer,
|
|
2558
|
+
reference_answer: context.evalCase.reference_answer ?? "",
|
|
2341
2559
|
expected_outcome: context.evalCase.expected_outcome,
|
|
2342
|
-
question:
|
|
2560
|
+
question: formattedQuestion
|
|
2343
2561
|
};
|
|
2344
2562
|
prompt = substituteVariables(systemPrompt, variables);
|
|
2345
|
-
systemPrompt =
|
|
2563
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2346
2564
|
}
|
|
2347
|
-
const metadata = {
|
|
2348
|
-
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
2349
|
-
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2350
|
-
};
|
|
2565
|
+
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
2351
2566
|
const response = await judgeProvider.invoke({
|
|
2352
2567
|
question: prompt,
|
|
2353
2568
|
metadata,
|
|
@@ -2367,8 +2582,7 @@ var LlmJudgeEvaluator = class {
|
|
|
2367
2582
|
provider: judgeProvider.id,
|
|
2368
2583
|
prompt,
|
|
2369
2584
|
target: context.target.name,
|
|
2370
|
-
...systemPrompt !== void 0
|
|
2371
|
-
...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
|
|
2585
|
+
...systemPrompt !== void 0 && { systemPrompt }
|
|
2372
2586
|
};
|
|
2373
2587
|
return {
|
|
2374
2588
|
score,
|
|
@@ -2380,38 +2594,51 @@ var LlmJudgeEvaluator = class {
|
|
|
2380
2594
|
};
|
|
2381
2595
|
}
|
|
2382
2596
|
};
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
|
|
2396
|
-
|
|
2397
|
-
|
|
2398
|
-
|
|
2399
|
-
|
|
2597
|
+
function buildSystemPrompt(hasReferenceAnswer) {
|
|
2598
|
+
const basePrompt = [
|
|
2599
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2600
|
+
""
|
|
2601
|
+
];
|
|
2602
|
+
if (hasReferenceAnswer) {
|
|
2603
|
+
basePrompt.push(
|
|
2604
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
2605
|
+
""
|
|
2606
|
+
);
|
|
2607
|
+
}
|
|
2608
|
+
basePrompt.push(
|
|
2609
|
+
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2610
|
+
"",
|
|
2611
|
+
"You must respond with a single JSON object matching this schema:",
|
|
2612
|
+
"",
|
|
2613
|
+
"{",
|
|
2614
|
+
' "score": <number between 0.0 and 1.0>,',
|
|
2615
|
+
' "hits": [<array of strings, max 4 items, brief specific achievements>],',
|
|
2616
|
+
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
2617
|
+
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2618
|
+
"}"
|
|
2619
|
+
);
|
|
2620
|
+
return basePrompt.join("\n");
|
|
2621
|
+
}
|
|
2622
|
+
function buildQualityPrompt(evalCase, candidate, question) {
|
|
2400
2623
|
const parts = [
|
|
2401
2624
|
"[[ ## expected_outcome ## ]]",
|
|
2402
2625
|
evalCase.expected_outcome.trim(),
|
|
2403
2626
|
"",
|
|
2404
2627
|
"[[ ## question ## ]]",
|
|
2405
|
-
|
|
2406
|
-
""
|
|
2407
|
-
"[[ ## reference_answer ## ]]",
|
|
2408
|
-
evalCase.reference_answer.trim(),
|
|
2409
|
-
"",
|
|
2410
|
-
"[[ ## candidate_answer ## ]]",
|
|
2411
|
-
candidate.trim(),
|
|
2412
|
-
"",
|
|
2413
|
-
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
2628
|
+
question.trim(),
|
|
2629
|
+
""
|
|
2414
2630
|
];
|
|
2631
|
+
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
2632
|
+
parts.push(
|
|
2633
|
+
"[[ ## reference_answer ## ]]",
|
|
2634
|
+
evalCase.reference_answer.trim(),
|
|
2635
|
+
""
|
|
2636
|
+
);
|
|
2637
|
+
}
|
|
2638
|
+
parts.push(
|
|
2639
|
+
"[[ ## candidate_answer ## ]]",
|
|
2640
|
+
candidate.trim()
|
|
2641
|
+
);
|
|
2415
2642
|
return parts.join("\n");
|
|
2416
2643
|
}
|
|
2417
2644
|
function clampScore(value) {
|
|
@@ -2494,6 +2721,9 @@ function extractJsonBlob(text) {
|
|
|
2494
2721
|
function isNonEmptyString(value) {
|
|
2495
2722
|
return typeof value === "string" && value.trim().length > 0;
|
|
2496
2723
|
}
|
|
2724
|
+
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
2725
|
+
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
2726
|
+
}
|
|
2497
2727
|
var CodeEvaluator = class {
|
|
2498
2728
|
kind = "code";
|
|
2499
2729
|
script;
|
|
@@ -3152,11 +3382,27 @@ async function evaluateCandidate(options) {
|
|
|
3152
3382
|
agentTimeoutMs
|
|
3153
3383
|
});
|
|
3154
3384
|
const completedAt = nowFn();
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
|
|
3159
|
-
|
|
3385
|
+
let agentProviderRequest;
|
|
3386
|
+
let lmProviderRequest;
|
|
3387
|
+
if (isAgentProvider(provider)) {
|
|
3388
|
+
agentProviderRequest = {
|
|
3389
|
+
question: promptInputs.question,
|
|
3390
|
+
guideline_paths: evalCase.guideline_paths
|
|
3391
|
+
};
|
|
3392
|
+
} else {
|
|
3393
|
+
if (promptInputs.chatPrompt) {
|
|
3394
|
+
lmProviderRequest = {
|
|
3395
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
3396
|
+
guideline_paths: evalCase.guideline_paths
|
|
3397
|
+
};
|
|
3398
|
+
} else {
|
|
3399
|
+
lmProviderRequest = {
|
|
3400
|
+
question: promptInputs.question,
|
|
3401
|
+
guidelines: promptInputs.guidelines,
|
|
3402
|
+
guideline_paths: evalCase.guideline_paths
|
|
3403
|
+
};
|
|
3404
|
+
}
|
|
3405
|
+
}
|
|
3160
3406
|
return {
|
|
3161
3407
|
eval_id: evalCase.id,
|
|
3162
3408
|
dataset: evalCase.dataset,
|
|
@@ -3170,7 +3416,8 @@ async function evaluateCandidate(options) {
|
|
|
3170
3416
|
timestamp: completedAt.toISOString(),
|
|
3171
3417
|
reasoning: score.reasoning,
|
|
3172
3418
|
raw_aspects: score.rawAspects,
|
|
3173
|
-
|
|
3419
|
+
agent_provider_request: agentProviderRequest,
|
|
3420
|
+
lm_provider_request: lmProviderRequest,
|
|
3174
3421
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3175
3422
|
evaluator_results: evaluatorResults
|
|
3176
3423
|
};
|
|
@@ -3329,8 +3576,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3329
3576
|
now,
|
|
3330
3577
|
judgeProvider,
|
|
3331
3578
|
systemPrompt: customPrompt,
|
|
3332
|
-
evaluator: config
|
|
3333
|
-
judgeModel: config.model
|
|
3579
|
+
evaluator: config
|
|
3334
3580
|
});
|
|
3335
3581
|
}
|
|
3336
3582
|
async function resolveCustomPrompt(config) {
|
|
@@ -3399,6 +3645,7 @@ async function invokeProvider(provider, options) {
|
|
|
3399
3645
|
question: promptInputs.question,
|
|
3400
3646
|
guidelines: promptInputs.guidelines,
|
|
3401
3647
|
guideline_patterns: evalCase.guideline_patterns,
|
|
3648
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
3402
3649
|
inputFiles: evalCase.file_paths,
|
|
3403
3650
|
evalCaseId: evalCase.id,
|
|
3404
3651
|
attempt,
|
|
@@ -3415,12 +3662,30 @@ async function invokeProvider(provider, options) {
|
|
|
3415
3662
|
}
|
|
3416
3663
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
3417
3664
|
const message = error instanceof Error ? error.message : String(error);
|
|
3418
|
-
|
|
3419
|
-
|
|
3420
|
-
|
|
3421
|
-
|
|
3422
|
-
|
|
3423
|
-
|
|
3665
|
+
let agentProviderRequest;
|
|
3666
|
+
let lmProviderRequest;
|
|
3667
|
+
if (isAgentProvider(provider)) {
|
|
3668
|
+
agentProviderRequest = {
|
|
3669
|
+
question: promptInputs.question,
|
|
3670
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3671
|
+
error: message
|
|
3672
|
+
};
|
|
3673
|
+
} else {
|
|
3674
|
+
if (promptInputs.chatPrompt) {
|
|
3675
|
+
lmProviderRequest = {
|
|
3676
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
3677
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3678
|
+
error: message
|
|
3679
|
+
};
|
|
3680
|
+
} else {
|
|
3681
|
+
lmProviderRequest = {
|
|
3682
|
+
question: promptInputs.question,
|
|
3683
|
+
guidelines: promptInputs.guidelines,
|
|
3684
|
+
guideline_paths: evalCase.guideline_paths,
|
|
3685
|
+
error: message
|
|
3686
|
+
};
|
|
3687
|
+
}
|
|
3688
|
+
}
|
|
3424
3689
|
return {
|
|
3425
3690
|
eval_id: evalCase.id,
|
|
3426
3691
|
dataset: evalCase.dataset,
|
|
@@ -3433,7 +3698,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3433
3698
|
target: targetName,
|
|
3434
3699
|
timestamp: timestamp.toISOString(),
|
|
3435
3700
|
raw_aspects: [],
|
|
3436
|
-
|
|
3701
|
+
agent_provider_request: agentProviderRequest,
|
|
3702
|
+
lm_provider_request: lmProviderRequest,
|
|
3437
3703
|
error: message
|
|
3438
3704
|
};
|
|
3439
3705
|
}
|
|
@@ -3445,6 +3711,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
3445
3711
|
hash.update(promptInputs.question);
|
|
3446
3712
|
hash.update(promptInputs.guidelines);
|
|
3447
3713
|
hash.update(promptInputs.systemMessage ?? "");
|
|
3714
|
+
if (promptInputs.chatPrompt) {
|
|
3715
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
3716
|
+
}
|
|
3448
3717
|
return hash.digest("hex");
|
|
3449
3718
|
}
|
|
3450
3719
|
function isTimeoutLike(error) {
|
|
@@ -3492,6 +3761,7 @@ export {
|
|
|
3492
3761
|
loadEvalCases,
|
|
3493
3762
|
normalizeLineEndings,
|
|
3494
3763
|
readTargetDefinitions,
|
|
3764
|
+
readTestSuiteMetadata,
|
|
3495
3765
|
readTextFile,
|
|
3496
3766
|
resolveAndCreateProvider,
|
|
3497
3767
|
resolveFileReference,
|