agentv 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -590,7 +590,7 @@ import fg from "fast-glob";
|
|
|
590
590
|
import { stat as stat3 } from "node:fs/promises";
|
|
591
591
|
import path15 from "node:path";
|
|
592
592
|
|
|
593
|
-
// ../../packages/core/dist/chunk-
|
|
593
|
+
// ../../packages/core/dist/chunk-YQBJAT5I.js
|
|
594
594
|
import { constants } from "node:fs";
|
|
595
595
|
import { access, readFile } from "node:fs/promises";
|
|
596
596
|
import path from "node:path";
|
|
@@ -4636,7 +4636,7 @@ var coerce = {
|
|
|
4636
4636
|
};
|
|
4637
4637
|
var NEVER = INVALID;
|
|
4638
4638
|
|
|
4639
|
-
// ../../packages/core/dist/chunk-
|
|
4639
|
+
// ../../packages/core/dist/chunk-YQBJAT5I.js
|
|
4640
4640
|
async function fileExists(filePath) {
|
|
4641
4641
|
try {
|
|
4642
4642
|
await access(filePath, constants.F_OK);
|
|
@@ -11947,14 +11947,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11947
11947
|
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
11948
11948
|
continue;
|
|
11949
11949
|
}
|
|
11950
|
-
|
|
11951
|
-
logWarning(`Eval case '${id}' missing expected_messages array`);
|
|
11952
|
-
continue;
|
|
11953
|
-
}
|
|
11950
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
11954
11951
|
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
11955
|
-
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
11956
|
-
if (expectedMessages.length === 0) {
|
|
11957
|
-
logWarning(`No expected message found for eval case: ${id}`);
|
|
11952
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
11953
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
11954
|
+
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
11958
11955
|
continue;
|
|
11959
11956
|
}
|
|
11960
11957
|
if (expectedMessages.length > 1) {
|
|
@@ -11972,17 +11969,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
11972
11969
|
messageType: "input",
|
|
11973
11970
|
verbose
|
|
11974
11971
|
});
|
|
11975
|
-
const outputSegments = await processMessages({
|
|
11972
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
11976
11973
|
messages: expectedMessages,
|
|
11977
11974
|
searchRoots,
|
|
11978
11975
|
repoRootPath,
|
|
11979
11976
|
guidelinePatterns,
|
|
11980
11977
|
messageType: "output",
|
|
11981
11978
|
verbose
|
|
11982
|
-
});
|
|
11979
|
+
}) : [];
|
|
11983
11980
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
11984
11981
|
const expectedContent = expectedMessages[0]?.content;
|
|
11985
|
-
const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
|
|
11982
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
11986
11983
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
11987
11984
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
11988
11985
|
const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
|
|
@@ -12001,6 +11998,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
12001
11998
|
dataset: datasetName,
|
|
12002
11999
|
conversation_id: conversationId,
|
|
12003
12000
|
question,
|
|
12001
|
+
input_messages: inputMessages,
|
|
12004
12002
|
input_segments: inputSegments,
|
|
12005
12003
|
output_segments: outputSegments,
|
|
12006
12004
|
reference_answer: referenceAnswer,
|
|
@@ -12028,6 +12026,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
12028
12026
|
}
|
|
12029
12027
|
return results;
|
|
12030
12028
|
}
|
|
12029
|
+
/**
 * Decides whether the prompt must be rendered with explicit role markers.
 *
 * Markers are needed when the conversation contains an assistant or tool
 * turn, or when more than one message carries visible content.
 *
 * @param {Array<{role: string}>} messages - Input messages of the eval case.
 * @param {Array<Array<object>>} processedSegmentsByMessage - Processed
 *   segments, one array per message.
 * @returns {boolean} True when role markers should be used.
 */
function needsRoleMarkers(messages, processedSegmentsByMessage) {
  const hasNonUserTurn = messages.some(({ role }) => role === "assistant" || role === "tool");
  if (hasNonUserTurn) {
    return true;
  }
  const visibleMessageCount = processedSegmentsByMessage.filter(
    (segments) => hasVisibleContent(segments)
  ).length;
  return visibleMessageCount > 1;
}
|
|
12041
|
+
/**
 * Reports whether any segment of a message would render visible text.
 *
 * "text" segments count when their `value` is non-blank, "file" segments
 * count when their `text` is non-blank; "guideline_ref" and every other
 * segment type never count.
 *
 * NOTE(review): relies on `asString`, defined elsewhere in this file;
 * presumably it yields `undefined` for non-string input — confirm.
 *
 * @param {Array<object>} segments - Processed segments of a single message.
 * @returns {boolean} True when at least one segment has non-blank content.
 */
function hasVisibleContent(segments) {
  const isNonBlank = (candidate) => candidate !== void 0 && candidate.trim().length > 0;
  return segments.some((segment) => {
    switch (asString(segment.type)) {
      case "text":
        return isNonBlank(asString(segment.value));
      case "file":
        return isNonBlank(asString(segment.text));
      default:
        // Covers "guideline_ref" and any unknown segment type.
        return false;
    }
  });
}
|
|
12058
|
+
/**
 * Renders one processed segment as prompt text.
 *
 * - "text" → the segment's `value`.
 * - "guideline_ref" → `<Attached: path>` when a path is present.
 * - "file" → a `=== path ===` header followed by the file text, only when
 *   both the path and the text are present.
 *
 * NOTE(review): relies on `asString`, defined elsewhere in this file;
 * presumably it yields `undefined` for non-string input — confirm.
 *
 * @param {object} segment - Processed segment.
 * @returns {string|undefined} Formatted text, or undefined when the segment
 *   has nothing to render.
 */
function formatSegment(segment) {
  switch (asString(segment.type)) {
    case "text":
      return asString(segment.value);
    case "guideline_ref": {
      const attachedPath = asString(segment.path);
      if (!attachedPath) {
        return void 0;
      }
      return `<Attached: ${attachedPath}>`;
    }
    case "file": {
      const body = asString(segment.text);
      const location = asString(segment.path);
      return body && location ? `=== ${location} ===
${body}` : void 0;
    }
    default:
      return void 0;
  }
}
|
|
12031
12077
|
async function buildPromptInputs(testCase) {
|
|
12032
12078
|
const guidelineContents = [];
|
|
12033
12079
|
for (const rawPath of testCase.guideline_paths) {
|
|
@@ -12044,36 +12090,168 @@ ${content}`);
|
|
|
12044
12090
|
logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
12045
12091
|
}
|
|
12046
12092
|
}
|
|
12047
|
-
const
|
|
12093
|
+
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
12094
|
+
const segmentsByMessage = [];
|
|
12095
|
+
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
12048
12096
|
for (const segment of testCase.input_segments) {
|
|
12049
|
-
|
|
12050
|
-
|
|
12051
|
-
const pathValue = segment.path;
|
|
12052
|
-
const textValue = segment.text;
|
|
12053
|
-
const label = typeof pathValue === "string" ? pathValue : "file";
|
|
12054
|
-
const body = typeof textValue === "string" ? textValue : "";
|
|
12055
|
-
questionParts.push(`=== ${label} ===
|
|
12056
|
-
${body}`);
|
|
12057
|
-
continue;
|
|
12097
|
+
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
12098
|
+
fileContentsByPath.set(segment.path, segment.text);
|
|
12058
12099
|
}
|
|
12059
|
-
|
|
12060
|
-
|
|
12061
|
-
|
|
12062
|
-
|
|
12100
|
+
}
|
|
12101
|
+
for (const message of testCase.input_messages) {
|
|
12102
|
+
const messageSegments = [];
|
|
12103
|
+
if (typeof message.content === "string") {
|
|
12104
|
+
if (message.content.trim().length > 0) {
|
|
12105
|
+
messageSegments.push({ type: "text", value: message.content });
|
|
12106
|
+
}
|
|
12107
|
+
} else if (Array.isArray(message.content)) {
|
|
12108
|
+
for (const segment of message.content) {
|
|
12109
|
+
if (typeof segment === "string") {
|
|
12110
|
+
if (segment.trim().length > 0) {
|
|
12111
|
+
messageSegments.push({ type: "text", value: segment });
|
|
12112
|
+
}
|
|
12113
|
+
} else if (isJsonObject(segment)) {
|
|
12114
|
+
const type = asString(segment.type);
|
|
12115
|
+
if (type === "file") {
|
|
12116
|
+
const value = asString(segment.value);
|
|
12117
|
+
if (!value) continue;
|
|
12118
|
+
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
12119
|
+
messageSegments.push({ type: "guideline_ref", path: value });
|
|
12120
|
+
continue;
|
|
12121
|
+
}
|
|
12122
|
+
const fileText = fileContentsByPath.get(value);
|
|
12123
|
+
if (fileText !== void 0) {
|
|
12124
|
+
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
12125
|
+
}
|
|
12126
|
+
} else if (type === "text") {
|
|
12127
|
+
const textValue = asString(segment.value);
|
|
12128
|
+
if (textValue && textValue.trim().length > 0) {
|
|
12129
|
+
messageSegments.push({ type: "text", value: textValue });
|
|
12130
|
+
}
|
|
12131
|
+
}
|
|
12132
|
+
}
|
|
12063
12133
|
}
|
|
12064
|
-
continue;
|
|
12065
12134
|
}
|
|
12066
|
-
|
|
12067
|
-
|
|
12068
|
-
|
|
12135
|
+
segmentsByMessage.push(messageSegments);
|
|
12136
|
+
}
|
|
12137
|
+
const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
|
|
12138
|
+
let question;
|
|
12139
|
+
if (useRoleMarkers) {
|
|
12140
|
+
const messageParts = [];
|
|
12141
|
+
for (let i6 = 0; i6 < testCase.input_messages.length; i6++) {
|
|
12142
|
+
const message = testCase.input_messages[i6];
|
|
12143
|
+
const segments = segmentsByMessage[i6];
|
|
12144
|
+
if (!hasVisibleContent(segments)) {
|
|
12145
|
+
continue;
|
|
12146
|
+
}
|
|
12147
|
+
const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
|
|
12148
|
+
const contentParts = [];
|
|
12149
|
+
for (const segment of segments) {
|
|
12150
|
+
const formattedContent = formatSegment(segment);
|
|
12151
|
+
if (formattedContent) {
|
|
12152
|
+
contentParts.push(formattedContent);
|
|
12153
|
+
}
|
|
12154
|
+
}
|
|
12155
|
+
if (contentParts.length > 0) {
|
|
12156
|
+
const messageContent = contentParts.join("\n");
|
|
12157
|
+
messageParts.push(`@[${roleLabel}]:
|
|
12158
|
+
${messageContent}`);
|
|
12159
|
+
}
|
|
12160
|
+
}
|
|
12161
|
+
question = messageParts.join("\n\n");
|
|
12162
|
+
} else {
|
|
12163
|
+
const questionParts = [];
|
|
12164
|
+
for (const segment of testCase.input_segments) {
|
|
12165
|
+
const formattedContent = formatSegment(segment);
|
|
12166
|
+
if (formattedContent) {
|
|
12167
|
+
questionParts.push(formattedContent);
|
|
12168
|
+
}
|
|
12169
|
+
}
|
|
12170
|
+
if (testCase.code_snippets.length > 0) {
|
|
12171
|
+
questionParts.push(testCase.code_snippets.join("\n"));
|
|
12069
12172
|
}
|
|
12173
|
+
question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
12070
12174
|
}
|
|
12071
|
-
|
|
12072
|
-
|
|
12175
|
+
const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
|
|
12176
|
+
messages: testCase.input_messages,
|
|
12177
|
+
segmentsByMessage,
|
|
12178
|
+
guidelinePatterns: testCase.guideline_patterns,
|
|
12179
|
+
guidelineContent: guidelines
|
|
12180
|
+
}) : void 0;
|
|
12181
|
+
return { question, guidelines, chatPrompt };
|
|
12182
|
+
}
|
|
12183
|
+
/**
 * Builds a chat-style prompt (array of `{ role, content, name? }`) from the
 * input messages and their already-processed segments.
 *
 * Leading system messages are merged, together with the optional
 * `systemPrompt` and guideline content, into one system message. Remaining
 * messages are emitted in order with these role adjustments:
 * - a later "system" message becomes an "assistant" message whose content
 *   starts with the "@[System]:" marker;
 * - a "tool" message becomes a "function" message named "tool".
 *
 * Guideline references and file segments matching `guidelinePatterns` are
 * left out of non-leading messages. Messages with no renderable content are
 * skipped entirely, including later system messages, which would otherwise
 * be emitted as a bare "@[System]:" marker.
 *
 * @param {object} options
 * @param {Array<{role: string}>} options.messages - Input messages.
 * @param {Array<Array<object>>} options.segmentsByMessage - Processed
 *   segments, index-aligned with `messages`; a missing entry is treated as
 *   an empty list.
 * @param {unknown} [options.guidelinePatterns] - Passed to `isGuidelineFile`.
 * @param {string} [options.guidelineContent] - Guideline text for the system
 *   message.
 * @param {string} [options.systemPrompt] - Extra system prompt text.
 * @returns {Array<object>|undefined} The chat prompt, or undefined when no
 *   message has content.
 */
function buildChatPromptFromSegments(options) {
  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
  if (messages.length === 0) {
    return void 0;
  }
  const systemSegments = [];
  if (systemPrompt && systemPrompt.trim().length > 0) {
    systemSegments.push(systemPrompt.trim());
  }
  if (guidelineContent && guidelineContent.trim().length > 0) {
    systemSegments.push(`[[ ## Guidelines ## ]]

${guidelineContent.trim()}`);
  }
  // Fold every leading system message into the single system entry.
  let startIndex = 0;
  while (startIndex < messages.length && messages[startIndex].role === "system") {
    const contentParts = [];
    for (const segment of segmentsByMessage[startIndex] ?? []) {
      const formatted = formatSegment(segment);
      if (formatted) {
        contentParts.push(formatted);
      }
    }
    if (contentParts.length > 0) {
      systemSegments.push(contentParts.join("\n"));
    }
    startIndex += 1;
  }
  const chatPrompt = [];
  if (systemSegments.length > 0) {
    chatPrompt.push({
      role: "system",
      content: systemSegments.join("\n\n")
    });
  }
  for (let index = startIndex; index < messages.length; index++) {
    const message = messages[index];
    const isLateSystemMessage = message.role === "system";
    const isToolMessage = message.role === "tool";
    const bodyParts = [];
    for (const segment of segmentsByMessage[index] ?? []) {
      if (segment.type === "guideline_ref") {
        continue;
      }
      const formatted = formatSegment(segment);
      if (!formatted) {
        continue;
      }
      const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
      if (isGuidelineRef) {
        continue;
      }
      bodyParts.push(formatted);
    }
    // Decide emptiness before adding the marker, so a system message with
    // no content is dropped instead of emitted as a bare "@[System]:".
    if (bodyParts.length === 0) {
      continue;
    }
    let role = message.role;
    if (isLateSystemMessage) {
      role = "assistant";
    } else if (isToolMessage) {
      role = "function";
    }
    const contentParts = isLateSystemMessage ? ["@[System]:", ...bodyParts] : bodyParts;
    chatPrompt.push({
      role,
      content: contentParts.join("\n"),
      ...(isToolMessage ? { name: "tool" } : {})
    });
  }
  return chatPrompt.length > 0 ? chatPrompt : void 0;
}
|
|
12078
12256
|
async function fileExists2(absolutePath) {
|
|
12079
12257
|
try {
|
|
@@ -12267,21 +12445,14 @@ ${detailBlock}${ANSI_RESET}`);
|
|
|
12267
12445
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
12268
12446
|
function buildChatPrompt(request) {
|
|
12269
12447
|
if (request.chatPrompt) {
|
|
12270
|
-
|
|
12271
|
-
|
|
12272
|
-
|
|
12273
|
-
|
|
12274
|
-
|
|
12275
|
-
|
|
12276
|
-
} else {
|
|
12277
|
-
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
12278
|
-
}
|
|
12279
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
12280
|
-
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
12281
|
-
|
|
12282
|
-
${request.guidelines.trim()}`);
|
|
12448
|
+
const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
|
|
12449
|
+
if (hasSystemMessage) {
|
|
12450
|
+
return request.chatPrompt;
|
|
12451
|
+
}
|
|
12452
|
+
const systemContent2 = resolveSystemContent(request);
|
|
12453
|
+
return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
|
|
12283
12454
|
}
|
|
12284
|
-
const systemContent =
|
|
12455
|
+
const systemContent = resolveSystemContent(request);
|
|
12285
12456
|
const userContent = request.question.trim();
|
|
12286
12457
|
const prompt = [
|
|
12287
12458
|
{
|
|
@@ -12295,6 +12466,21 @@ ${request.guidelines.trim()}`);
|
|
|
12295
12466
|
];
|
|
12296
12467
|
return prompt;
|
|
12297
12468
|
}
|
|
12469
|
+
/**
 * Produces the system message text for a provider request.
 *
 * Uses the trimmed `request.metadata.systemPrompt` when it is a non-blank
 * string, otherwise `DEFAULT_SYSTEM_PROMPT`. Non-blank `request.guidelines`
 * are appended under a "[[ ## Guidelines ## ]]" heading.
 *
 * @param {object} request - Provider request.
 * @returns {string} System content, sections separated by a blank line.
 */
function resolveSystemContent(request) {
  const rawPrompt = request.metadata?.systemPrompt;
  const customPrompt = typeof rawPrompt === "string" ? rawPrompt.trim() : "";
  const sections = [customPrompt.length > 0 ? customPrompt : DEFAULT_SYSTEM_PROMPT];
  const { guidelines } = request;
  if (guidelines && guidelines.trim().length > 0) {
    sections.push(`[[ ## Guidelines ## ]]

${guidelines.trim()}`);
  }
  return sections.join("\n\n");
}
|
|
12298
12484
|
function extractModelConfig(request, defaults) {
|
|
12299
12485
|
const temperature = request.temperature ?? defaults.temperature;
|
|
12300
12486
|
const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
@@ -13955,19 +14141,21 @@ var LlmJudgeEvaluator = class {
|
|
|
13955
14141
|
return this.evaluateWithPrompt(context2, judgeProvider);
|
|
13956
14142
|
}
|
|
13957
14143
|
async evaluateWithPrompt(context2, judgeProvider) {
|
|
13958
|
-
|
|
13959
|
-
|
|
14144
|
+
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context2.evalCase);
|
|
14145
|
+
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
14146
|
+
let prompt = buildQualityPrompt(context2.evalCase, context2.candidate, formattedQuestion);
|
|
14147
|
+
let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
|
|
13960
14148
|
if (systemPrompt && hasTemplateVariables(systemPrompt)) {
|
|
13961
14149
|
const variables = {
|
|
13962
14150
|
input_messages: JSON.stringify(context2.evalCase.input_segments, null, 2),
|
|
13963
14151
|
output_messages: JSON.stringify(context2.evalCase.output_segments, null, 2),
|
|
13964
14152
|
candidate_answer: context2.candidate,
|
|
13965
|
-
reference_answer: context2.evalCase.reference_answer,
|
|
14153
|
+
reference_answer: context2.evalCase.reference_answer ?? "",
|
|
13966
14154
|
expected_outcome: context2.evalCase.expected_outcome,
|
|
13967
|
-
question:
|
|
14155
|
+
question: formattedQuestion
|
|
13968
14156
|
};
|
|
13969
14157
|
prompt = substituteVariables(systemPrompt, variables);
|
|
13970
|
-
systemPrompt =
|
|
14158
|
+
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
13971
14159
|
}
|
|
13972
14160
|
const metadata = {
|
|
13973
14161
|
...systemPrompt !== void 0 ? { systemPrompt } : {},
|
|
@@ -14005,38 +14193,51 @@ var LlmJudgeEvaluator = class {
|
|
|
14005
14193
|
};
|
|
14006
14194
|
}
|
|
14007
14195
|
};
|
|
14008
|
-
|
|
14009
|
-
|
|
14010
|
-
|
|
14011
|
-
|
|
14012
|
-
|
|
14013
|
-
|
|
14014
|
-
|
|
14015
|
-
|
|
14016
|
-
|
|
14017
|
-
|
|
14018
|
-
|
|
14019
|
-
|
|
14020
|
-
|
|
14021
|
-
|
|
14022
|
-
|
|
14023
|
-
|
|
14024
|
-
|
|
14196
|
+
/**
 * Builds the judge's system prompt.
 *
 * The paragraph telling the judge to use the reference answer as a gold
 * standard is included only when `hasReferenceAnswer` is truthy. The prompt
 * always ends with the JSON response schema.
 *
 * @param {boolean} hasReferenceAnswer - Whether a reference answer exists.
 * @returns {string} Newline-joined system prompt.
 */
function buildSystemPrompt(hasReferenceAnswer) {
  const referenceGuidance = hasReferenceAnswer ? [
    "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
    ""
  ] : [];
  const promptLines = [
    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
    "",
    ...referenceGuidance,
    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
    "",
    "You must respond with a single JSON object matching this schema:",
    "",
    "{",
    ' "score": <number between 0.0 and 1.0>,',
    ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
    ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
    ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
    "}"
  ];
  return promptLines.join("\n");
}
|
|
14221
|
+
/**
 * Builds the judge's user prompt from bracket-delimited sections:
 * expected_outcome, question, reference_answer (only when the eval case
 * has a non-empty one) and candidate_answer.
 *
 * @param {object} evalCase - Eval case; `expected_outcome` is read, and
 *   `reference_answer` when present.
 * @param {string} candidate - Candidate answer being graded.
 * @param {string} question - Formatted question text.
 * @returns {string} Newline-joined prompt.
 */
function buildQualityPrompt(evalCase, candidate, question) {
  const sections = [
    "[[ ## expected_outcome ## ]]",
    evalCase.expected_outcome.trim(),
    "",
    "[[ ## question ## ]]",
    question.trim(),
    "",
    ...(hasNonEmptyReferenceAnswer(evalCase) ? [
      "[[ ## reference_answer ## ]]",
      evalCase.reference_answer.trim(),
      ""
    ] : []),
    "[[ ## candidate_answer ## ]]",
    candidate.trim()
  ];
  return sections.join("\n");
}
|
|
14042
14243
|
function clampScore(value) {
|
|
@@ -14119,6 +14320,9 @@ function extractJsonBlob(text) {
|
|
|
14119
14320
|
function isNonEmptyString(value) {
|
|
14120
14321
|
return typeof value === "string" && value.trim().length > 0;
|
|
14121
14322
|
}
|
|
14323
|
+
/**
 * Reports whether the eval case has a usable reference answer.
 *
 * Only a string with at least one non-whitespace character counts. `null`,
 * `undefined` and non-string values return false instead of throwing.
 *
 * @param {{reference_answer?: unknown}} evalCase - Eval case to inspect.
 * @returns {boolean} True when `reference_answer` is a non-blank string.
 */
function hasNonEmptyReferenceAnswer(evalCase) {
  const referenceAnswer = evalCase.reference_answer;
  return typeof referenceAnswer === "string" && referenceAnswer.trim().length > 0;
}
|
|
14122
14326
|
var CodeEvaluator = class {
|
|
14123
14327
|
kind = "code";
|
|
14124
14328
|
script;
|
|
@@ -14766,11 +14970,27 @@ async function evaluateCandidate(options) {
|
|
|
14766
14970
|
agentTimeoutMs
|
|
14767
14971
|
});
|
|
14768
14972
|
const completedAt = nowFn();
|
|
14769
|
-
|
|
14770
|
-
|
|
14771
|
-
|
|
14772
|
-
|
|
14773
|
-
|
|
14973
|
+
let agentProviderRequest;
|
|
14974
|
+
let lmProviderRequest;
|
|
14975
|
+
if (isAgentProvider(provider)) {
|
|
14976
|
+
agentProviderRequest = {
|
|
14977
|
+
question: promptInputs.question,
|
|
14978
|
+
guideline_paths: evalCase.guideline_paths
|
|
14979
|
+
};
|
|
14980
|
+
} else {
|
|
14981
|
+
if (promptInputs.chatPrompt) {
|
|
14982
|
+
lmProviderRequest = {
|
|
14983
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
14984
|
+
guideline_paths: evalCase.guideline_paths
|
|
14985
|
+
};
|
|
14986
|
+
} else {
|
|
14987
|
+
lmProviderRequest = {
|
|
14988
|
+
question: promptInputs.question,
|
|
14989
|
+
guidelines: promptInputs.guidelines,
|
|
14990
|
+
guideline_paths: evalCase.guideline_paths
|
|
14991
|
+
};
|
|
14992
|
+
}
|
|
14993
|
+
}
|
|
14774
14994
|
return {
|
|
14775
14995
|
eval_id: evalCase.id,
|
|
14776
14996
|
dataset: evalCase.dataset,
|
|
@@ -14784,7 +15004,8 @@ async function evaluateCandidate(options) {
|
|
|
14784
15004
|
timestamp: completedAt.toISOString(),
|
|
14785
15005
|
reasoning: score.reasoning,
|
|
14786
15006
|
raw_aspects: score.rawAspects,
|
|
14787
|
-
|
|
15007
|
+
agent_provider_request: agentProviderRequest,
|
|
15008
|
+
lm_provider_request: lmProviderRequest,
|
|
14788
15009
|
evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
14789
15010
|
evaluator_results: evaluatorResults
|
|
14790
15011
|
};
|
|
@@ -15013,6 +15234,7 @@ async function invokeProvider(provider, options) {
|
|
|
15013
15234
|
question: promptInputs.question,
|
|
15014
15235
|
guidelines: promptInputs.guidelines,
|
|
15015
15236
|
guideline_patterns: evalCase.guideline_patterns,
|
|
15237
|
+
chatPrompt: promptInputs.chatPrompt,
|
|
15016
15238
|
inputFiles: evalCase.file_paths,
|
|
15017
15239
|
evalCaseId: evalCase.id,
|
|
15018
15240
|
attempt,
|
|
@@ -15029,12 +15251,30 @@ async function invokeProvider(provider, options) {
|
|
|
15029
15251
|
}
|
|
15030
15252
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
15031
15253
|
const message = error instanceof Error ? error.message : String(error);
|
|
15032
|
-
|
|
15033
|
-
|
|
15034
|
-
|
|
15035
|
-
|
|
15036
|
-
|
|
15037
|
-
|
|
15254
|
+
let agentProviderRequest;
|
|
15255
|
+
let lmProviderRequest;
|
|
15256
|
+
if (isAgentProvider(provider)) {
|
|
15257
|
+
agentProviderRequest = {
|
|
15258
|
+
question: promptInputs.question,
|
|
15259
|
+
guideline_paths: evalCase.guideline_paths,
|
|
15260
|
+
error: message
|
|
15261
|
+
};
|
|
15262
|
+
} else {
|
|
15263
|
+
if (promptInputs.chatPrompt) {
|
|
15264
|
+
lmProviderRequest = {
|
|
15265
|
+
chat_prompt: promptInputs.chatPrompt,
|
|
15266
|
+
guideline_paths: evalCase.guideline_paths,
|
|
15267
|
+
error: message
|
|
15268
|
+
};
|
|
15269
|
+
} else {
|
|
15270
|
+
lmProviderRequest = {
|
|
15271
|
+
question: promptInputs.question,
|
|
15272
|
+
guidelines: promptInputs.guidelines,
|
|
15273
|
+
guideline_paths: evalCase.guideline_paths,
|
|
15274
|
+
error: message
|
|
15275
|
+
};
|
|
15276
|
+
}
|
|
15277
|
+
}
|
|
15038
15278
|
return {
|
|
15039
15279
|
eval_id: evalCase.id,
|
|
15040
15280
|
dataset: evalCase.dataset,
|
|
@@ -15047,7 +15287,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
15047
15287
|
target: targetName,
|
|
15048
15288
|
timestamp: timestamp.toISOString(),
|
|
15049
15289
|
raw_aspects: [],
|
|
15050
|
-
|
|
15290
|
+
agent_provider_request: agentProviderRequest,
|
|
15291
|
+
lm_provider_request: lmProviderRequest,
|
|
15051
15292
|
error: message
|
|
15052
15293
|
};
|
|
15053
15294
|
}
|
|
@@ -15059,6 +15300,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
|
15059
15300
|
hash.update(promptInputs.question);
|
|
15060
15301
|
hash.update(promptInputs.guidelines);
|
|
15061
15302
|
hash.update(promptInputs.systemMessage ?? "");
|
|
15303
|
+
if (promptInputs.chatPrompt) {
|
|
15304
|
+
hash.update(JSON.stringify(promptInputs.chatPrompt));
|
|
15305
|
+
}
|
|
15062
15306
|
return hash.digest("hex");
|
|
15063
15307
|
}
|
|
15064
15308
|
function isTimeoutLike(error) {
|
|
@@ -15486,8 +15730,6 @@ import { stripVTControlCharacters } from "node:util";
|
|
|
15486
15730
|
var ESC = "\x1B[";
|
|
15487
15731
|
var CLEAR_LINE = `${ESC}K`;
|
|
15488
15732
|
var MOVE_CURSOR_UP = `${ESC}1A`;
|
|
15489
|
-
var SYNC_START = `${ESC}?2026h`;
|
|
15490
|
-
var SYNC_END = `${ESC}?2026l`;
|
|
15491
15733
|
var ProgressDisplay = class {
|
|
15492
15734
|
workers = /* @__PURE__ */ new Map();
|
|
15493
15735
|
maxWorkers;
|
|
@@ -15963,14 +16205,14 @@ async function validateEvalFile(filePath) {
|
|
|
15963
16205
|
validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
|
|
15964
16206
|
}
|
|
15965
16207
|
const expectedMessages = evalCase["expected_messages"];
|
|
15966
|
-
if (!Array.isArray(expectedMessages)) {
|
|
16208
|
+
if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
|
|
15967
16209
|
errors.push({
|
|
15968
16210
|
severity: "error",
|
|
15969
16211
|
filePath: absolutePath,
|
|
15970
16212
|
location: `${location}.expected_messages`,
|
|
15971
|
-
message: "
|
|
16213
|
+
message: "Invalid 'expected_messages' field (must be an array if provided)"
|
|
15972
16214
|
});
|
|
15973
|
-
} else {
|
|
16215
|
+
} else if (Array.isArray(expectedMessages)) {
|
|
15974
16216
|
validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
|
|
15975
16217
|
}
|
|
15976
16218
|
}
|
|
@@ -16006,11 +16248,13 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
16006
16248
|
}
|
|
16007
16249
|
const content = message["content"];
|
|
16008
16250
|
if (typeof content === "string") {
|
|
16251
|
+
validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
|
|
16009
16252
|
} else if (Array.isArray(content)) {
|
|
16010
16253
|
for (let j2 = 0; j2 < content.length; j2++) {
|
|
16011
16254
|
const contentItem = content[j2];
|
|
16012
16255
|
const contentLocation = `${msgLocation}.content[${j2}]`;
|
|
16013
16256
|
if (typeof contentItem === "string") {
|
|
16257
|
+
validateContentForRoleMarkers(contentItem, contentLocation, filePath, errors);
|
|
16014
16258
|
} else if (isObject(contentItem)) {
|
|
16015
16259
|
const type = contentItem["type"];
|
|
16016
16260
|
if (typeof type !== "string") {
|
|
@@ -16030,6 +16274,8 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
16030
16274
|
location: `${contentLocation}.value`,
|
|
16031
16275
|
message: "Content with type 'text' must have a 'value' field"
|
|
16032
16276
|
});
|
|
16277
|
+
} else {
|
|
16278
|
+
validateContentForRoleMarkers(value, `${contentLocation}.value`, filePath, errors);
|
|
16033
16279
|
}
|
|
16034
16280
|
}
|
|
16035
16281
|
} else {
|
|
@@ -16051,6 +16297,19 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
16051
16297
|
}
|
|
16052
16298
|
}
|
|
16053
16299
|
}
|
|
16300
|
+
/**
 * Appends a warning to `errors` for every role marker ("@[System]:",
 * "@[User]:", "@[Assistant]:", "@[Tool]:") found in `content`, matched
 * case-insensitively.
 *
 * Non-string content is ignored rather than throwing.
 *
 * @param {unknown} content - Message text to scan.
 * @param {string} location - Location label recorded on each warning.
 * @param {string} filePath - File path recorded on each warning.
 * @param {Array<object>} errors - Collector that warnings are pushed onto.
 * @returns {void}
 */
function validateContentForRoleMarkers(content, location, filePath, errors) {
  if (typeof content !== "string") {
    return;
  }
  const markers = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"];
  // Lower-case the content once instead of once per marker.
  const normalizedContent = content.toLowerCase();
  for (const marker of markers) {
    if (normalizedContent.includes(marker.toLowerCase())) {
      errors.push({
        severity: "warning",
        filePath,
        location,
        message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
      });
    }
  }
}
|
|
16054
16313
|
function isObject2(value) {
|
|
16055
16314
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
16056
16315
|
}
|
|
@@ -17665,4 +17924,4 @@ export {
|
|
|
17665
17924
|
createProgram,
|
|
17666
17925
|
runCli
|
|
17667
17926
|
};
|
|
17668
|
-
//# sourceMappingURL=chunk-
|
|
17927
|
+
//# sourceMappingURL=chunk-J5HK75TC.js.map
|