agentv 0.9.0 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -590,7 +590,7 @@ import fg from "fast-glob";
|
|
|
590
590
|
import { stat as stat3 } from "node:fs/promises";
|
|
591
591
|
import path15 from "node:path";
|
|
592
592
|
|
|
593
|
-
// ../../packages/core/dist/chunk-
|
|
593
|
+
// ../../packages/core/dist/chunk-YQBJAT5I.js
|
|
594
594
|
import { constants } from "node:fs";
|
|
595
595
|
import { access, readFile } from "node:fs/promises";
|
|
596
596
|
import path from "node:path";
|
|
@@ -4636,7 +4636,7 @@ var coerce = {
|
|
|
4636
4636
|
};
|
|
4637
4637
|
var NEVER = INVALID;
|
|
4638
4638
|
|
|
4639
|
-
// ../../packages/core/dist/chunk-
|
|
4639
|
+
// ../../packages/core/dist/chunk-YQBJAT5I.js
|
|
4640
4640
|
async function fileExists(filePath) {
|
|
4641
4641
|
try {
|
|
4642
4642
|
await access(filePath, constants.F_OK);
|
|
@@ -11752,6 +11752,33 @@ var ANSI_YELLOW = "\x1B[33m";
|
|
|
11752
11752
|
var ANSI_RESET = "\x1B[0m";
|
|
11753
11753
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
11754
11754
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
11755
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
11756
|
+
try {
|
|
11757
|
+
const absolutePath = path8.resolve(testFilePath);
|
|
11758
|
+
const content = await readFile3(absolutePath, "utf8");
|
|
11759
|
+
const parsed = parse3(content);
|
|
11760
|
+
if (!isJsonObject(parsed)) {
|
|
11761
|
+
return {};
|
|
11762
|
+
}
|
|
11763
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
11764
|
+
} catch {
|
|
11765
|
+
return {};
|
|
11766
|
+
}
|
|
11767
|
+
}
|
|
11768
|
+
function extractTargetFromSuite(suite) {
|
|
11769
|
+
const execution = suite.execution;
|
|
11770
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
11771
|
+
const executionTarget = execution.target;
|
|
11772
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
11773
|
+
return executionTarget.trim();
|
|
11774
|
+
}
|
|
11775
|
+
}
|
|
11776
|
+
const targetValue = suite.target;
|
|
11777
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
11778
|
+
return targetValue.trim();
|
|
11779
|
+
}
|
|
11780
|
+
return void 0;
|
|
11781
|
+
}
|
|
11755
11782
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
11756
11783
|
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
11757
11784
|
for (const directory of directories) {
|
|
@@ -11928,6 +11955,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11928
11955
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
11929
11956
|
}
|
|
11930
11957
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
11958
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
11959
|
+
const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
|
|
11931
11960
|
const results = [];
|
|
11932
11961
|
for (const rawEvalcase of rawTestcases) {
|
|
11933
11962
|
if (!isJsonObject(rawEvalcase)) {
|
|
@@ -11947,14 +11976,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11947
11976
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
11948
11977
|
continue;
|
|
11949
11978
|
}
|
|
11950
|
-
|
|
11951
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
11952
|
-
continue;
|
|
11953
|
-
}
|
|
11979
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
11954
11980
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
11955
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
11956
|
-
if (expectedMessages.length === 0) {
|
|
11957
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
11981
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
11982
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
11983
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
11958
11984
|
continue;
|
|
11959
11985
|
}
|
|
11960
11986
|
if (expectedMessages.length > 1) {
|
|
@@ -11972,20 +11998,20 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11972
11998
|
messageType: "input",
|
|
11973
11999
|
verbose
|
|
11974
12000
|
});
|
|
11975
|
-
const outputSegments = await processMessages({
|
|
12001
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
11976
12002
|
messages: expectedMessages,
|
|
11977
12003
|
searchRoots,
|
|
11978
12004
|
repoRootPath,
|
|
11979
12005
|
guidelinePatterns,
|
|
11980
12006
|
messageType: "output",
|
|
11981
12007
|
verbose
|
|
11982
|
-
});
|
|
12008
|
+
}) : [];
|
|
11983
12009
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
11984
12010
|
const expectedContent = expectedMessages[0]?.content;
|
|
11985
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
12011
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
11986
12012
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
11987
12013
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
11988
|
-
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
12014
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
11989
12015
|
const userFilePaths = [];
|
|
11990
12016
|
for (const segment of inputSegments) {
|
|
11991
12017
|
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
@@ -12001,6 +12027,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
12001
12027
|
dataset: datasetName,
|
|
12002
12028
|
conversation_id: conversationId,
|
|
12003
12029
|
question,
|
|
12030
|
+
input_messages: inputMessages,
|
|
12004
12031
|
input_segments: inputSegments,
|
|
12005
12032
|
output_segments: outputSegments,
|
|
12006
12033
|
reference_answer: referenceAnswer,
|
|
@@ -12028,6 +12055,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
12028
12055
|
}
|
|
12029
12056
|
return results;
|
|
12030
12057
|
}
|
|
12058
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
12059
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
12060
|
+
return true;
|
|
12061
|
+
}
|
|
12062
|
+
let messagesWithContent = 0;
|
|
12063
|
+
for (const segments of processedSegmentsByMessage) {
|
|
12064
|
+
if (hasVisibleContent(segments)) {
|
|
12065
|
+
messagesWithContent++;
|
|
12066
|
+
}
|
|
12067
|
+
}
|
|
12068
|
+
return messagesWithContent > 1;
|
|
12069
|
+
}
|
|
12070
|
+
function hasVisibleContent(segments) {
|
|
12071
|
+
return segments.some((segment) => {
|
|
12072
|
+
const type = asString(segment.type);
|
|
12073
|
+
if (type === "text") {
|
|
12074
|
+
const value = asString(segment.value);
|
|
12075
|
+
return value !== void 0 && value.trim().length > 0;
|
|
12076
|
+
}
|
|
12077
|
+
if (type === "guideline_ref") {
|
|
12078
|
+
return false;
|
|
12079
|
+
}
|
|
12080
|
+
if (type === "file") {
|
|
12081
|
+
const text = asString(segment.text);
|
|
12082
|
+
return text !== void 0 && text.trim().length > 0;
|
|
12083
|
+
}
|
|
12084
|
+
return false;
|
|
12085
|
+
});
|
|
12086
|
+
}
|
|
12087
|
+
function formatSegment(segment) {
|
|
12088
|
+
const type = asString(segment.type);
|
|
12089
|
+
if (type === "text") {
|
|
12090
|
+
return asString(segment.value);
|
|
12091
|
+
}
|
|
12092
|
+
if (type === "guideline_ref") {
|
|
12093
|
+
const refPath = asString(segment.path);
|
|
12094
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
12095
|
+
}
|
|
12096
|
+
if (type === "file") {
|
|
12097
|
+
const text = asString(segment.text);
|
|
12098
|
+
const filePath = asString(segment.path);
|
|
12099
|
+
if (text && filePath) {
|
|
12100
|
+
return `=== ${filePath} ===
|
|
12101
|
+
${text}`;
|
|
12102
|
+
}
|
|
12103
|
+
}
|
|
12104
|
+
return void 0;
|
|
12105
|
+
}
|
|
12031
12106
|
async function buildPromptInputs(testCase) {
|
|
12032
12107
|
const guidelineContents = [];
|
|
12033
12108
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -12044,36 +12119,168 @@ ${content}`);
|
|
|
12044
12119
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
12045
12120
|
}
|
|
12046
12121
|
}
|
|
12047
|
-
const
|
|
12122
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
12123
|
+
const segmentsByMessage = [];
|
|
12124
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
12048
12125
|
for (const segment of testCase.input_segments) {
|
|
12049
|
-
|
|
12050
|
-
|
|
12051
|
-
const pathValue = segment.path;
|
|
12052
|
-
const textValue = segment.text;
|
|
12053
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
12054
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
12055
|
-
questionParts.push(`=== ${label} ===
|
|
12056
|
-
${body}`);
|
|
12057
|
-
continue;
|
|
12126
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
12127
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
12058
12128
|
}
|
|
12059
|
-
|
|
12060
|
-
|
|
12061
|
-
|
|
12062
|
-
|
|
12129
|
+
}
|
|
12130
|
+
for (const message of testCase.input_messages) {
|
|
12131
|
+
const messageSegments = [];
|
|
12132
|
+
if (typeof message.content === "string") {
|
|
12133
|
+
if (message.content.trim().length > 0) {
|
|
12134
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
12135
|
+
}
|
|
12136
|
+
} else if (Array.isArray(message.content)) {
|
|
12137
|
+
for (const segment of message.content) {
|
|
12138
|
+
if (typeof segment === "string") {
|
|
12139
|
+
if (segment.trim().length > 0) {
|
|
12140
|
+
messageSegments.push({ type: "text", value: segment });
|
|
12141
|
+
}
|
|
12142
|
+
} else if (isJsonObject(segment)) {
|
|
12143
|
+
const type = asString(segment.type);
|
|
12144
|
+
if (type === "file") {
|
|
12145
|
+
const value = asString(segment.value);
|
|
12146
|
+
if (!value) continue;
|
|
12147
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
12148
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
12149
|
+
continue;
|
|
12150
|
+
}
|
|
12151
|
+
const fileText = fileContentsByPath.get(value);
|
|
12152
|
+
if (fileText !== void 0) {
|
|
12153
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
12154
|
+
}
|
|
12155
|
+
} else if (type === "text") {
|
|
12156
|
+
const textValue = asString(segment.value);
|
|
12157
|
+
if (textValue && textValue.trim().length > 0) {
|
|
12158
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
12159
|
+
}
|
|
12160
|
+
}
|
|
12161
|
+
}
|
|
12063
12162
|
}
|
|
12064
|
-
continue;
|
|
12065
12163
|
}
|
|
12066
|
-
|
|
12067
|
-
|
|
12068
|
-
|
|
12164
|
+
segmentsByMessage.push(messageSegments);
|
|
12165
|
+
}
|
|
12166
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
12167
|
+
let question;
|
|
12168
|
+
if (useRoleMarkers) {
|
|
12169
|
+
const messageParts = [];
|
|
12170
|
+
for (let i6 = 0; i6 < testCase.input_messages.length; i6++) {
|
|
12171
|
+
const message = testCase.input_messages[i6];
|
|
12172
|
+
const segments = segmentsByMessage[i6];
|
|
12173
|
+
if (!hasVisibleContent(segments)) {
|
|
12174
|
+
continue;
|
|
12175
|
+
}
|
|
12176
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
12177
|
+
const contentParts = [];
|
|
12178
|
+
for (const segment of segments) {
|
|
12179
|
+
const formattedContent = formatSegment(segment);
|
|
12180
|
+
if (formattedContent) {
|
|
12181
|
+
contentParts.push(formattedContent);
|
|
12182
|
+
}
|
|
12183
|
+
}
|
|
12184
|
+
if (contentParts.length > 0) {
|
|
12185
|
+
const messageContent = contentParts.join("\n");
|
|
12186
|
+
messageParts.push(`@[${roleLabel}]:
|
|
12187
|
+
${messageContent}`);
|
|
12188
|
+
}
|
|
12189
|
+
}
|
|
12190
|
+
question = messageParts.join("\n\n");
|
|
12191
|
+
} else {
|
|
12192
|
+
const questionParts = [];
|
|
12193
|
+
for (const segment of testCase.input_segments) {
|
|
12194
|
+
const formattedContent = formatSegment(segment);
|
|
12195
|
+
if (formattedContent) {
|
|
12196
|
+
questionParts.push(formattedContent);
|
|
12197
|
+
}
|
|
12069
12198
|
}
|
|
12199
|
+
if (testCase.code_snippets.length > 0) {
|
|
12200
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
12201
|
+
}
|
|
12202
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
12070
12203
|
}
|
|
12071
|
-
|
|
12072
|
-
|
|
12204
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
12205
|
+
messages: testCase.input_messages,
|
|
12206
|
+
segmentsByMessage,
|
|
12207
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
12208
|
+
guidelineContent: guidelines
|
|
12209
|
+
}) : void 0;
|
|
12210
|
+
return { question, guidelines, chatPrompt };
|
|
12211
|
+
}
|
|
12212
|
+
function buildChatPromptFromSegments(options) {
|
|
12213
|
+
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
12214
|
+
if (messages.length === 0) {
|
|
12215
|
+
return void 0;
|
|
12073
12216
|
}
|
|
12074
|
-
const
|
|
12075
|
-
|
|
12076
|
-
|
|
12217
|
+
const systemSegments = [];
|
|
12218
|
+
if (systemPrompt && systemPrompt.trim().length > 0) {
|
|
12219
|
+
systemSegments.push(systemPrompt.trim());
|
|
12220
|
+
}
|
|
12221
|
+
if (guidelineContent && guidelineContent.trim().length > 0) {
|
|
12222
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
12223
|
+
|
|
12224
|
+
${guidelineContent.trim()}`);
|
|
12225
|
+
}
|
|
12226
|
+
let startIndex = 0;
|
|
12227
|
+
while (startIndex < messages.length && messages[startIndex].role === "system") {
|
|
12228
|
+
const segments = segmentsByMessage[startIndex];
|
|
12229
|
+
const contentParts = [];
|
|
12230
|
+
for (const segment of segments) {
|
|
12231
|
+
const formatted = formatSegment(segment);
|
|
12232
|
+
if (formatted) {
|
|
12233
|
+
contentParts.push(formatted);
|
|
12234
|
+
}
|
|
12235
|
+
}
|
|
12236
|
+
if (contentParts.length > 0) {
|
|
12237
|
+
systemSegments.push(contentParts.join("\n"));
|
|
12238
|
+
}
|
|
12239
|
+
startIndex += 1;
|
|
12240
|
+
}
|
|
12241
|
+
const chatPrompt = [];
|
|
12242
|
+
if (systemSegments.length > 0) {
|
|
12243
|
+
chatPrompt.push({
|
|
12244
|
+
role: "system",
|
|
12245
|
+
content: systemSegments.join("\n\n")
|
|
12246
|
+
});
|
|
12247
|
+
}
|
|
12248
|
+
for (let i6 = startIndex; i6 < messages.length; i6++) {
|
|
12249
|
+
const message = messages[i6];
|
|
12250
|
+
const segments = segmentsByMessage[i6];
|
|
12251
|
+
const contentParts = [];
|
|
12252
|
+
let role = message.role;
|
|
12253
|
+
let name;
|
|
12254
|
+
if (role === "system") {
|
|
12255
|
+
role = "assistant";
|
|
12256
|
+
contentParts.push("@[System]:");
|
|
12257
|
+
} else if (role === "tool") {
|
|
12258
|
+
role = "function";
|
|
12259
|
+
name = "tool";
|
|
12260
|
+
}
|
|
12261
|
+
for (const segment of segments) {
|
|
12262
|
+
if (segment.type === "guideline_ref") {
|
|
12263
|
+
continue;
|
|
12264
|
+
}
|
|
12265
|
+
const formatted = formatSegment(segment);
|
|
12266
|
+
if (formatted) {
|
|
12267
|
+
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
12268
|
+
if (isGuidelineRef) {
|
|
12269
|
+
continue;
|
|
12270
|
+
}
|
|
12271
|
+
contentParts.push(formatted);
|
|
12272
|
+
}
|
|
12273
|
+
}
|
|
12274
|
+
if (contentParts.length === 0) {
|
|
12275
|
+
continue;
|
|
12276
|
+
}
|
|
12277
|
+
chatPrompt.push({
|
|
12278
|
+
role,
|
|
12279
|
+
content: contentParts.join("\n"),
|
|
12280
|
+
...name ? { name } : {}
|
|
12281
|
+
});
|
|
12282
|
+
}
|
|
12283
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
12077
12284
|
}
|
|
12078
12285
|
async function fileExists2(absolutePath) {
|
|
12079
12286
|
try {
|
|
@@ -12171,9 +12378,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
|
12171
12378
|
}
|
|
12172
12379
|
return parts.join(" ");
|
|
12173
12380
|
}
|
|
12174
|
-
async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
12381
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
12175
12382
|
const execution = rawEvalCase.execution;
|
|
12176
|
-
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
|
|
12383
|
+
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
12177
12384
|
if (candidateEvaluators === void 0) {
|
|
12178
12385
|
return void 0;
|
|
12179
12386
|
}
|
|
@@ -12211,6 +12418,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
12211
12418
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
12212
12419
|
);
|
|
12213
12420
|
}
|
|
12421
|
+
} else {
|
|
12422
|
+
resolvedCwd = searchRoots[0];
|
|
12214
12423
|
}
|
|
12215
12424
|
evaluators.push({
|
|
12216
12425
|
name,
|
|
@@ -12239,8 +12448,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
|
|
|
12239
12448
|
name,
|
|
12240
12449
|
type: "llm_judge",
|
|
12241
12450
|
prompt,
|
|
12242
|
-
promptPath
|
|
12243
|
-
model
|
|
12451
|
+
promptPath
|
|
12244
12452
|
});
|
|
12245
12453
|
}
|
|
12246
12454
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -12267,21 +12475,14 @@ ${detailBlock}${ANSI_RESET}`);
|
|
|
12267
12475
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
12268
12476
|
function buildChatPrompt(request) {
|
|
12269
12477
|
if (request.chatPrompt) {
|
|
12270
|
-
|
|
12271
|
-
|
|
12272
|
-
|
|
12273
|
-
|
|
12274
|
-
|
|
12275
|
-
|
|
12276
|
-
} else {
|
|
12277
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
12278
|
-
}
|
|
12279
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
12280
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
12281
|
-
|
|
12282
|
-
${request.guidelines.trim()}`);
|
|
12478
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
12479
|
+
if (hasSystemMessage) {
|
|
12480
|
+
return request.chatPrompt;
|
|
12481
|
+
}
|
|
12482
|
+
const systemContent2 = resolveSystemContent(request);
|
|
12483
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
12283
12484
|
}
|
|
12284
|
-
const systemContent =
|
|
12485
|
+
const systemContent = resolveSystemContent(request);
|
|
12285
12486
|
const userContent = request.question.trim();
|
|
12286
12487
|
const prompt = [
|
|
12287
12488
|
{
|
|
@@ -12295,6 +12496,21 @@ ${request.guidelines.trim()}`);
|
|
|
12295
12496
|
];
|
|
12296
12497
|
return prompt;
|
|
12297
12498
|
}
|
|
12499
|
+
function resolveSystemContent(request) {
|
|
12500
|
+
const systemSegments = [];
|
|
12501
|
+
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
12502
|
+
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
12503
|
+
systemSegments.push(metadataSystemPrompt.trim());
|
|
12504
|
+
} else {
|
|
12505
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
12506
|
+
}
|
|
12507
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
12508
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
12509
|
+
|
|
12510
|
+
${request.guidelines.trim()}`);
|
|
12511
|
+
}
|
|
12512
|
+
return systemSegments.join("\n\n");
|
|
12513
|
+
}
|
|
12298
12514
|
function extractModelConfig(request, defaults) {
|
|
12299
12515
|
const temperature = request.temperature ?? defaults.temperature;
|
|
12300
12516
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -13955,24 +14171,23 @@ var LlmJudgeEvaluator = class {
|
|
|
13955
14171
|
return this.evaluateWithPrompt(context2, judgeProvider);
|
|
13956
14172
|
}
|
|
13957
14173
|
async evaluateWithPrompt(context2, judgeProvider) {
|
|
13958
|
-
|
|
13959
|
-
|
|
14174
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context2.evalCase);
|
|
14175
|
+
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
14176
|
+
let prompt = buildQualityPrompt(context2.evalCase, context2.candidate, formattedQuestion);
|
|
14177
|
+
let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
13960
14178
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
13961
14179
|
const variables = {
|
|
13962
14180
|
input_messages: JSON.stringify(context2.evalCase.input_segments, null, 2),
|
|
13963
14181
|
output_messages: JSON.stringify(context2.evalCase.output_segments, null, 2),
|
|
13964
14182
|
candidate_answer: context2.candidate,
|
|
13965
|
-
reference_answer: context2.evalCase.reference_answer,
|
|
14183
|
+
reference_answer: context2.evalCase.reference_answer ?? "",
|
|
13966
14184
|
expected_outcome: context2.evalCase.expected_outcome,
|
|
13967
|
-
question:
|
|
14185
|
+
question: formattedQuestion
|
|
13968
14186
|
};
|
|
13969
14187
|
prompt = substituteVariables(systemPrompt, variables);
|
|
13970
|
-
systemPrompt =
|
|
14188
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
13971
14189
|
}
|
|
13972
|
-
const metadata = {
|
|
13973
|
-
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
13974
|
-
...context2.judgeModel !== void 0 ? { model: context2.judgeModel } : {}
|
|
13975
|
-
};
|
|
14190
|
+
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
13976
14191
|
const response = await judgeProvider.invoke({
|
|
13977
14192
|
question: prompt,
|
|
13978
14193
|
metadata,
|
|
@@ -13992,8 +14207,7 @@ var LlmJudgeEvaluator = class {
|
|
|
13992
14207
|
provider: judgeProvider.id,
|
|
13993
14208
|
prompt,
|
|
13994
14209
|
target: context2.target.name,
|
|
13995
|
-
...systemPrompt !== void 0
|
|
13996
|
-
...context2.judgeModel !== void 0 ? { model: context2.judgeModel } : {}
|
|
14210
|
+
...systemPrompt !== void 0 && { systemPrompt }
|
|
13997
14211
|
};
|
|
13998
14212
|
return {
|
|
13999
14213
|
score,
|
|
@@ -14005,38 +14219,51 @@ var LlmJudgeEvaluator = class {
|
|
|
14005
14219
|
};
|
|
14006
14220
|
}
|
|
14007
14221
|
};
|
|
14008
|
-
|
|
14009
|
-
|
|
14010
|
-
|
|
14011
|
-
|
|
14012
|
-
|
|
14013
|
-
|
|
14014
|
-
|
|
14015
|
-
|
|
14016
|
-
|
|
14017
|
-
|
|
14018
|
-
|
|
14019
|
-
|
|
14020
|
-
|
|
14021
|
-
|
|
14022
|
-
|
|
14023
|
-
|
|
14024
|
-
|
|
14222
|
+
function buildSystemPrompt(hasReferenceAnswer) {
|
|
14223
|
+
const basePrompt = [
|
|
14224
|
+
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
14225
|
+
""
|
|
14226
|
+
];
|
|
14227
|
+
if (hasReferenceAnswer) {
|
|
14228
|
+
basePrompt.push(
|
|
14229
|
+
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
14230
|
+
""
|
|
14231
|
+
);
|
|
14232
|
+
}
|
|
14233
|
+
basePrompt.push(
|
|
14234
|
+
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
14235
|
+
"",
|
|
14236
|
+
"You must respond with a single JSON object matching this schema:",
|
|
14237
|
+
"",
|
|
14238
|
+
"{",
|
|
14239
|
+
' "score": <number between 0.0 and 1.0>,',
|
|
14240
|
+
' "hits": [<array of strings, max 4 items, brief specific achievements>],',
|
|
14241
|
+
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
14242
|
+
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
14243
|
+
"}"
|
|
14244
|
+
);
|
|
14245
|
+
return basePrompt.join("\n");
|
|
14246
|
+
}
|
|
14247
|
+
function buildQualityPrompt(evalCase, candidate, question) {
|
|
14025
14248
|
const parts = [
|
|
14026
14249
|
"[[ ## expected_outcome ## ]]",
|
|
14027
14250
|
evalCase.expected_outcome.trim(),
|
|
14028
14251
|
"",
|
|
14029
14252
|
"[[ ## question ## ]]",
|
|
14030
|
-
|
|
14031
|
-
""
|
|
14032
|
-
"[[ ## reference_answer ## ]]",
|
|
14033
|
-
evalCase.reference_answer.trim(),
|
|
14034
|
-
"",
|
|
14035
|
-
"[[ ## candidate_answer ## ]]",
|
|
14036
|
-
candidate.trim(),
|
|
14037
|
-
"",
|
|
14038
|
-
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
14253
|
+
question.trim(),
|
|
14254
|
+
""
|
|
14039
14255
|
];
|
|
14256
|
+
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
14257
|
+
parts.push(
|
|
14258
|
+
"[[ ## reference_answer ## ]]",
|
|
14259
|
+
evalCase.reference_answer.trim(),
|
|
14260
|
+
""
|
|
14261
|
+
);
|
|
14262
|
+
}
|
|
14263
|
+
parts.push(
|
|
14264
|
+
"[[ ## candidate_answer ## ]]",
|
|
14265
|
+
candidate.trim()
|
|
14266
|
+
);
|
|
14040
14267
|
return parts.join("\n");
|
|
14041
14268
|
}
|
|
14042
14269
|
function clampScore(value) {
|
|
@@ -14119,6 +14346,9 @@ function extractJsonBlob(text) {
|
|
|
14119
14346
|
function isNonEmptyString(value) {
|
|
14120
14347
|
return typeof value === "string" && value.trim().length > 0;
|
|
14121
14348
|
}
|
|
14349
|
+
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
14350
|
+
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
14351
|
+
}
|
|
14122
14352
|
var CodeEvaluator = class {
|
|
14123
14353
|
kind = "code";
|
|
14124
14354
|
script;
|
|
@@ -14766,11 +14996,27 @@ async function evaluateCandidate(options) {
|
|
|
14766
14996
|
agentTimeoutMs
|
|
14767
14997
|
});
|
|
14768
14998
|
const completedAt = nowFn();
|
|
14769
|
-
|
|
14770
|
-
|
|
14771
|
-
|
|
14772
|
-
|
|
14773
|
-
|
|
14999
|
+
let agentProviderRequest;
|
|
15000
|
+
let lmProviderRequest;
|
|
15001
|
+
if (isAgentProvider(provider)) {
|
|
15002
|
+
agentProviderRequest = {
|
|
15003
|
+
question: promptInputs.question,
|
|
15004
|
+
guideline_paths: evalCase.guideline_paths
|
|
15005
|
+
};
|
|
15006
|
+
} else {
|
|
15007
|
+
if (promptInputs.chatPrompt) {
|
|
15008
|
+
lmProviderRequest = {
|
|
15009
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
15010
|
+
guideline_paths: evalCase.guideline_paths
|
|
15011
|
+
};
|
|
15012
|
+
} else {
|
|
15013
|
+
lmProviderRequest = {
|
|
15014
|
+
question: promptInputs.question,
|
|
15015
|
+
guidelines: promptInputs.guidelines,
|
|
15016
|
+
guideline_paths: evalCase.guideline_paths
|
|
15017
|
+
};
|
|
15018
|
+
}
|
|
15019
|
+
}
|
|
14774
15020
|
return {
|
|
14775
15021
|
eval_id: evalCase.id,
|
|
14776
15022
|
dataset: evalCase.dataset,
|
|
@@ -14784,7 +15030,8 @@ async function evaluateCandidate(options) {
|
|
|
14784
15030
|
timestamp: completedAt.toISOString(),
|
|
14785
15031
|
reasoning: score.reasoning,
|
|
14786
15032
|
raw_aspects: score.rawAspects,
|
|
14787
|
-
|
|
15033
|
+
agent_provider_request: agentProviderRequest,
|
|
15034
|
+
lm_provider_request: lmProviderRequest,
|
|
14788
15035
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
14789
15036
|
evaluator_results: evaluatorResults
|
|
14790
15037
|
};
|
|
@@ -14943,8 +15190,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
14943
15190
|
now,
|
|
14944
15191
|
judgeProvider,
|
|
14945
15192
|
systemPrompt: customPrompt,
|
|
14946
|
-
evaluator: config
|
|
14947
|
-
judgeModel: config.model
|
|
15193
|
+
evaluator: config
|
|
14948
15194
|
});
|
|
14949
15195
|
}
|
|
14950
15196
|
async function resolveCustomPrompt(config) {
|
|
@@ -15013,6 +15259,7 @@ async function invokeProvider(provider, options) {
|
|
|
15013
15259
|
question: promptInputs.question,
|
|
15014
15260
|
guidelines: promptInputs.guidelines,
|
|
15015
15261
|
guideline_patterns: evalCase.guideline_patterns,
|
|
15262
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
15016
15263
|
inputFiles: evalCase.file_paths,
|
|
15017
15264
|
evalCaseId: evalCase.id,
|
|
15018
15265
|
attempt,
|
|
@@ -15029,12 +15276,30 @@ async function invokeProvider(provider, options) {
|
|
|
15029
15276
|
}
|
|
15030
15277
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
15031
15278
|
const message = error instanceof Error ? error.message : String(error);
|
|
15032
|
-
|
|
15033
|
-
|
|
15034
|
-
|
|
15035
|
-
|
|
15036
|
-
|
|
15037
|
-
|
|
15279
|
+
let agentProviderRequest;
|
|
15280
|
+
let lmProviderRequest;
|
|
15281
|
+
if (isAgentProvider(provider)) {
|
|
15282
|
+
agentProviderRequest = {
|
|
15283
|
+
question: promptInputs.question,
|
|
15284
|
+
guideline_paths: evalCase.guideline_paths,
|
|
15285
|
+
error: message
|
|
15286
|
+
};
|
|
15287
|
+
} else {
|
|
15288
|
+
if (promptInputs.chatPrompt) {
|
|
15289
|
+
lmProviderRequest = {
|
|
15290
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
15291
|
+
guideline_paths: evalCase.guideline_paths,
|
|
15292
|
+
error: message
|
|
15293
|
+
};
|
|
15294
|
+
} else {
|
|
15295
|
+
lmProviderRequest = {
|
|
15296
|
+
question: promptInputs.question,
|
|
15297
|
+
guidelines: promptInputs.guidelines,
|
|
15298
|
+
guideline_paths: evalCase.guideline_paths,
|
|
15299
|
+
error: message
|
|
15300
|
+
};
|
|
15301
|
+
}
|
|
15302
|
+
}
|
|
15038
15303
|
return {
|
|
15039
15304
|
eval_id: evalCase.id,
|
|
15040
15305
|
dataset: evalCase.dataset,
|
|
@@ -15047,7 +15312,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
15047
15312
|
target: targetName,
|
|
15048
15313
|
timestamp: timestamp.toISOString(),
|
|
15049
15314
|
raw_aspects: [],
|
|
15050
|
-
|
|
15315
|
+
agent_provider_request: agentProviderRequest,
|
|
15316
|
+
lm_provider_request: lmProviderRequest,
|
|
15051
15317
|
error: message
|
|
15052
15318
|
};
|
|
15053
15319
|
}
|
|
@@ -15059,6 +15325,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
15059
15325
|
hash.update(promptInputs.question);
|
|
15060
15326
|
hash.update(promptInputs.guidelines);
|
|
15061
15327
|
hash.update(promptInputs.systemMessage ?? "");
|
|
15328
|
+
if (promptInputs.chatPrompt) {
|
|
15329
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
15330
|
+
}
|
|
15062
15331
|
return hash.digest("hex");
|
|
15063
15332
|
}
|
|
15064
15333
|
function isTimeoutLike(error) {
|
|
@@ -15486,8 +15755,6 @@ import { stripVTControlCharacters } from "node:util";
|
|
|
15486
15755
|
var ESC = "\x1B[";
|
|
15487
15756
|
var CLEAR_LINE = `${ESC}K`;
|
|
15488
15757
|
var MOVE_CURSOR_UP = `${ESC}1A`;
|
|
15489
|
-
var SYNC_START = `${ESC}?2026h`;
|
|
15490
|
-
var SYNC_END = `${ESC}?2026l`;
|
|
15491
15758
|
var ProgressDisplay = class {
|
|
15492
15759
|
workers = /* @__PURE__ */ new Map();
|
|
15493
15760
|
maxWorkers;
|
|
@@ -15963,14 +16230,14 @@ async function validateEvalFile(filePath) {
|
|
|
15963
16230
|
validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
|
|
15964
16231
|
}
|
|
15965
16232
|
const expectedMessages = evalCase["expected_messages"];
|
|
15966
|
-
if (!Array.isArray(expectedMessages)) {
|
|
16233
|
+
if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
|
|
15967
16234
|
errors.push({
|
|
15968
16235
|
severity: "error",
|
|
15969
16236
|
filePath: absolutePath,
|
|
15970
16237
|
location: `${location}.expected_messages`,
|
|
15971
|
-
message: "
|
|
16238
|
+
message: "Invalid 'expected_messages' field (must be an array if provided)"
|
|
15972
16239
|
});
|
|
15973
|
-
} else {
|
|
16240
|
+
} else if (Array.isArray(expectedMessages)) {
|
|
15974
16241
|
validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
|
|
15975
16242
|
}
|
|
15976
16243
|
}
|
|
@@ -16006,11 +16273,13 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
16006
16273
|
}
|
|
16007
16274
|
const content = message["content"];
|
|
16008
16275
|
if (typeof content === "string") {
|
|
16276
|
+
validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
|
|
16009
16277
|
} else if (Array.isArray(content)) {
|
|
16010
16278
|
for (let j2 = 0; j2 < content.length; j2++) {
|
|
16011
16279
|
const contentItem = content[j2];
|
|
16012
16280
|
const contentLocation = `${msgLocation}.content[${j2}]`;
|
|
16013
16281
|
if (typeof contentItem === "string") {
|
|
16282
|
+
validateContentForRoleMarkers(contentItem, contentLocation, filePath, errors);
|
|
16014
16283
|
} else if (isObject(contentItem)) {
|
|
16015
16284
|
const type = contentItem["type"];
|
|
16016
16285
|
if (typeof type !== "string") {
|
|
@@ -16030,6 +16299,8 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
16030
16299
|
location: `${contentLocation}.value`,
|
|
16031
16300
|
message: "Content with type 'text' must have a 'value' field"
|
|
16032
16301
|
});
|
|
16302
|
+
} else {
|
|
16303
|
+
validateContentForRoleMarkers(value, `${contentLocation}.value`, filePath, errors);
|
|
16033
16304
|
}
|
|
16034
16305
|
}
|
|
16035
16306
|
} else {
|
|
@@ -16051,6 +16322,19 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
16051
16322
|
}
|
|
16052
16323
|
}
|
|
16053
16324
|
}
|
|
16325
|
+
function validateContentForRoleMarkers(content, location, filePath, errors) {
|
|
16326
|
+
const markers = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"];
|
|
16327
|
+
for (const marker of markers) {
|
|
16328
|
+
if (content.toLowerCase().includes(marker.toLowerCase())) {
|
|
16329
|
+
errors.push({
|
|
16330
|
+
severity: "warning",
|
|
16331
|
+
filePath,
|
|
16332
|
+
location,
|
|
16333
|
+
message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
|
|
16334
|
+
});
|
|
16335
|
+
}
|
|
16336
|
+
}
|
|
16337
|
+
}
|
|
16054
16338
|
function isObject2(value) {
|
|
16055
16339
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
16056
16340
|
}
|
|
@@ -16659,9 +16943,8 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
16659
16943
|
|
|
16660
16944
|
// src/commands/eval/targets.ts
|
|
16661
16945
|
import { constants as constants5 } from "node:fs";
|
|
16662
|
-
import { access as access5
|
|
16946
|
+
import { access as access5 } from "node:fs/promises";
|
|
16663
16947
|
import path13 from "node:path";
|
|
16664
|
-
import { parse as parse6 } from "yaml";
|
|
16665
16948
|
var TARGET_FILE_CANDIDATES = [
|
|
16666
16949
|
"targets.yaml",
|
|
16667
16950
|
"targets.yml",
|
|
@@ -16683,18 +16966,8 @@ async function fileExists5(filePath) {
|
|
|
16683
16966
|
}
|
|
16684
16967
|
}
|
|
16685
16968
|
async function readTestSuiteTarget(testFilePath) {
|
|
16686
|
-
|
|
16687
|
-
|
|
16688
|
-
const parsed = parse6(raw);
|
|
16689
|
-
if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
|
|
16690
|
-
const targetValue = parsed.target;
|
|
16691
|
-
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
16692
|
-
return targetValue.trim();
|
|
16693
|
-
}
|
|
16694
|
-
}
|
|
16695
|
-
} catch {
|
|
16696
|
-
}
|
|
16697
|
-
return void 0;
|
|
16969
|
+
const metadata = await readTestSuiteMetadata(testFilePath);
|
|
16970
|
+
return metadata.target;
|
|
16698
16971
|
}
|
|
16699
16972
|
async function discoverTargetsFile(options) {
|
|
16700
16973
|
const { explicitPath, testFilePath, repoRoot, cwd } = options;
|
|
@@ -17665,4 +17938,4 @@ export {
|
|
|
17665
17938
|
createProgram,
|
|
17666
17939
|
runCli
|
|
17667
17940
|
};
|
|
17668
|
-
//# sourceMappingURL=chunk-
|
|
17941
|
+
//# sourceMappingURL=chunk-72BHGHIT.js.map
|