@agentv/core 0.2.3 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5REK5RSI.js → chunk-XXNQA4EW.js} +56 -2
- package/dist/chunk-XXNQA4EW.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +123 -12
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +7 -2
- package/dist/evaluation/validation/index.d.ts +7 -2
- package/dist/evaluation/validation/index.js +97 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +334 -201
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -19
- package/dist/index.d.ts +48 -19
- package/dist/index.js +293 -206
- package/dist/index.js.map +1 -1
- package/package.json +6 -2
- package/dist/chunk-5REK5RSI.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
|
+
TARGETS_SCHEMA_V2,
|
|
3
|
+
buildDirectoryChain,
|
|
2
4
|
buildSearchRoots,
|
|
5
|
+
fileExists,
|
|
6
|
+
findGitRoot,
|
|
3
7
|
resolveFileReference
|
|
4
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-XXNQA4EW.js";
|
|
5
9
|
|
|
6
10
|
// src/evaluation/types.ts
|
|
7
11
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -55,6 +59,7 @@ function getHitCount(result) {
|
|
|
55
59
|
}
|
|
56
60
|
|
|
57
61
|
// src/evaluation/yaml-parser.ts
|
|
62
|
+
import micromatch from "micromatch";
|
|
58
63
|
import { constants } from "node:fs";
|
|
59
64
|
import { access, readFile } from "node:fs/promises";
|
|
60
65
|
import path from "node:path";
|
|
@@ -64,9 +69,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
|
64
69
|
var ANSI_YELLOW = "\x1B[33m";
|
|
65
70
|
var ANSI_RESET = "\x1B[0m";
|
|
66
71
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
67
|
-
|
|
72
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
73
|
+
async function loadConfig(evalFilePath, repoRoot) {
|
|
74
|
+
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
75
|
+
for (const directory of directories) {
|
|
76
|
+
const configPath = path.join(directory, ".agentv", "config.yaml");
|
|
77
|
+
if (!await fileExists2(configPath)) {
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
try {
|
|
81
|
+
const rawConfig = await readFile(configPath, "utf8");
|
|
82
|
+
const parsed = parse(rawConfig);
|
|
83
|
+
if (!isJsonObject(parsed)) {
|
|
84
|
+
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
85
|
+
continue;
|
|
86
|
+
}
|
|
87
|
+
const config = parsed;
|
|
88
|
+
const schema = config.$schema;
|
|
89
|
+
if (schema !== SCHEMA_CONFIG_V2) {
|
|
90
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
91
|
+
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
92
|
+
logWarning(message);
|
|
93
|
+
continue;
|
|
94
|
+
}
|
|
95
|
+
const guidelinePatterns = config.guideline_patterns;
|
|
96
|
+
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
97
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
98
|
+
continue;
|
|
99
|
+
}
|
|
100
|
+
if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
|
|
101
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
|
|
102
|
+
continue;
|
|
103
|
+
}
|
|
104
|
+
return {
|
|
105
|
+
guideline_patterns: guidelinePatterns
|
|
106
|
+
};
|
|
107
|
+
} catch (error) {
|
|
108
|
+
logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
|
|
109
|
+
continue;
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
function isGuidelineFile(filePath, patterns) {
|
|
68
115
|
const normalized = filePath.split("\\").join("/");
|
|
69
|
-
|
|
116
|
+
const patternsToUse = patterns ?? [];
|
|
117
|
+
return micromatch.isMatch(normalized, patternsToUse);
|
|
70
118
|
}
|
|
71
119
|
function extractCodeBlocks(segments) {
|
|
72
120
|
const codeBlocks = [];
|
|
@@ -86,43 +134,45 @@ function extractCodeBlocks(segments) {
|
|
|
86
134
|
}
|
|
87
135
|
return codeBlocks;
|
|
88
136
|
}
|
|
89
|
-
async function
|
|
137
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
90
138
|
const verbose = options?.verbose ?? false;
|
|
91
|
-
const absoluteTestPath = path.resolve(
|
|
92
|
-
if (!await
|
|
93
|
-
throw new Error(`Test file not found: ${
|
|
139
|
+
const absoluteTestPath = path.resolve(evalFilePath);
|
|
140
|
+
if (!await fileExists2(absoluteTestPath)) {
|
|
141
|
+
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
94
142
|
}
|
|
95
143
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
96
144
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
145
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
146
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
97
147
|
const rawFile = await readFile(absoluteTestPath, "utf8");
|
|
98
148
|
const parsed = parse(rawFile);
|
|
99
149
|
if (!isJsonObject(parsed)) {
|
|
100
|
-
throw new Error(`Invalid test file format: ${
|
|
150
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
101
151
|
}
|
|
102
152
|
const suite = parsed;
|
|
103
153
|
const schema = suite.$schema;
|
|
104
154
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
105
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${
|
|
155
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
106
156
|
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
107
157
|
throw new Error(message);
|
|
108
158
|
}
|
|
109
159
|
const rawTestcases = suite.evalcases;
|
|
110
160
|
if (!Array.isArray(rawTestcases)) {
|
|
111
|
-
throw new Error(`Invalid test file format: ${
|
|
161
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
112
162
|
}
|
|
113
163
|
const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
|
|
114
164
|
const results = [];
|
|
115
|
-
for (const
|
|
116
|
-
if (!isJsonObject(
|
|
165
|
+
for (const rawEvalcase of rawTestcases) {
|
|
166
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
117
167
|
logWarning("Skipping invalid test case entry (expected object)");
|
|
118
168
|
continue;
|
|
119
169
|
}
|
|
120
|
-
const
|
|
121
|
-
const id = asString(
|
|
122
|
-
const conversationId = asString(
|
|
123
|
-
const outcome = asString(
|
|
124
|
-
const inputMessagesValue =
|
|
125
|
-
const expectedMessagesValue =
|
|
170
|
+
const evalcase = rawEvalcase;
|
|
171
|
+
const id = asString(evalcase.id);
|
|
172
|
+
const conversationId = asString(evalcase.conversation_id);
|
|
173
|
+
const outcome = asString(evalcase.outcome);
|
|
174
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
175
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
126
176
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
127
177
|
logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
|
|
128
178
|
continue;
|
|
@@ -135,6 +185,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
135
185
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
136
186
|
const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
|
|
137
187
|
const userMessages = inputMessages.filter((message) => message.role === "user");
|
|
188
|
+
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
138
189
|
if (assistantMessages.length === 0) {
|
|
139
190
|
logWarning(`No assistant message found for test case: ${id}`);
|
|
140
191
|
continue;
|
|
@@ -142,6 +193,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
142
193
|
if (assistantMessages.length > 1) {
|
|
143
194
|
logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
|
|
144
195
|
}
|
|
196
|
+
if (systemMessages.length > 1) {
|
|
197
|
+
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
198
|
+
}
|
|
199
|
+
let systemMessageContent;
|
|
200
|
+
if (systemMessages.length > 0) {
|
|
201
|
+
const content = systemMessages[0]?.content;
|
|
202
|
+
if (typeof content === "string") {
|
|
203
|
+
systemMessageContent = content;
|
|
204
|
+
} else if (Array.isArray(content)) {
|
|
205
|
+
const textParts = [];
|
|
206
|
+
for (const segment of content) {
|
|
207
|
+
if (isJsonObject(segment)) {
|
|
208
|
+
const value = segment.value;
|
|
209
|
+
if (typeof value === "string") {
|
|
210
|
+
textParts.push(value);
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
if (textParts.length > 0) {
|
|
215
|
+
systemMessageContent = textParts.join("\n\n");
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
145
219
|
const userSegments = [];
|
|
146
220
|
const guidelinePaths = [];
|
|
147
221
|
const userTextParts = [];
|
|
@@ -173,7 +247,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
173
247
|
}
|
|
174
248
|
try {
|
|
175
249
|
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
176
|
-
|
|
250
|
+
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
251
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
177
252
|
guidelinePaths.push(path.resolve(resolvedPath));
|
|
178
253
|
if (verbose) {
|
|
179
254
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
@@ -183,7 +258,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
183
258
|
userSegments.push({
|
|
184
259
|
type: "file",
|
|
185
260
|
path: displayPath,
|
|
186
|
-
text: fileContent
|
|
261
|
+
text: fileContent,
|
|
262
|
+
resolvedPath: path.resolve(resolvedPath)
|
|
187
263
|
});
|
|
188
264
|
if (verbose) {
|
|
189
265
|
console.log(` [File] Found: ${displayPath}`);
|
|
@@ -205,16 +281,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
205
281
|
}
|
|
206
282
|
const codeSnippets = extractCodeBlocks(userSegments);
|
|
207
283
|
const assistantContent = assistantMessages[0]?.content;
|
|
208
|
-
const expectedAssistantRaw =
|
|
284
|
+
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
209
285
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
210
|
-
const testCaseGrader = coerceGrader(
|
|
286
|
+
const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
|
|
287
|
+
const userFilePaths = [];
|
|
288
|
+
for (const segment of userSegments) {
|
|
289
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
290
|
+
userFilePaths.push(segment.resolvedPath);
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
const allFilePaths = [
|
|
294
|
+
...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
295
|
+
...userFilePaths
|
|
296
|
+
];
|
|
211
297
|
const testCase = {
|
|
212
298
|
id,
|
|
213
299
|
conversation_id: conversationId,
|
|
214
300
|
task: userTextPrompt,
|
|
215
301
|
user_segments: userSegments,
|
|
302
|
+
system_message: systemMessageContent,
|
|
216
303
|
expected_assistant_raw: expectedAssistantRaw,
|
|
217
304
|
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
305
|
+
guideline_patterns: guidelinePatterns,
|
|
306
|
+
file_paths: allFilePaths,
|
|
218
307
|
code_snippets: codeSnippets,
|
|
219
308
|
outcome,
|
|
220
309
|
grader: testCaseGrader
|
|
@@ -239,7 +328,7 @@ async function buildPromptInputs(testCase) {
|
|
|
239
328
|
const guidelineContents = [];
|
|
240
329
|
for (const rawPath of testCase.guideline_paths) {
|
|
241
330
|
const absolutePath = path.resolve(rawPath);
|
|
242
|
-
if (!await
|
|
331
|
+
if (!await fileExists2(absolutePath)) {
|
|
243
332
|
logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
244
333
|
continue;
|
|
245
334
|
}
|
|
@@ -280,9 +369,9 @@ ${body}`);
|
|
|
280
369
|
}
|
|
281
370
|
const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
282
371
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
283
|
-
return { request, guidelines };
|
|
372
|
+
return { request, guidelines, systemMessage: testCase.system_message };
|
|
284
373
|
}
|
|
285
|
-
async function
|
|
374
|
+
async function fileExists2(absolutePath) {
|
|
286
375
|
try {
|
|
287
376
|
await access(absolutePath, constants.F_OK);
|
|
288
377
|
return true;
|
|
@@ -321,7 +410,7 @@ function cloneJsonValue(value) {
|
|
|
321
410
|
}
|
|
322
411
|
return cloneJsonObject(value);
|
|
323
412
|
}
|
|
324
|
-
function
|
|
413
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
325
414
|
if (typeof content === "string") {
|
|
326
415
|
return content;
|
|
327
416
|
}
|
|
@@ -334,12 +423,42 @@ function normalizeAssistantContent(content) {
|
|
|
334
423
|
parts.push(entry);
|
|
335
424
|
continue;
|
|
336
425
|
}
|
|
337
|
-
|
|
426
|
+
if (!isJsonObject(entry)) {
|
|
427
|
+
continue;
|
|
428
|
+
}
|
|
429
|
+
const segmentType = asString(entry.type);
|
|
430
|
+
if (segmentType === "file") {
|
|
431
|
+
const rawValue = asString(entry.value);
|
|
432
|
+
if (!rawValue) {
|
|
433
|
+
continue;
|
|
434
|
+
}
|
|
435
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
436
|
+
rawValue,
|
|
437
|
+
searchRoots
|
|
438
|
+
);
|
|
439
|
+
if (!resolvedPath) {
|
|
440
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
441
|
+
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
442
|
+
continue;
|
|
443
|
+
}
|
|
444
|
+
try {
|
|
445
|
+
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
446
|
+
parts.push(fileContent);
|
|
447
|
+
if (verbose) {
|
|
448
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
449
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
450
|
+
}
|
|
451
|
+
} catch (error) {
|
|
452
|
+
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
453
|
+
}
|
|
454
|
+
continue;
|
|
455
|
+
}
|
|
456
|
+
const textValue = asString(entry.text);
|
|
338
457
|
if (typeof textValue === "string") {
|
|
339
458
|
parts.push(textValue);
|
|
340
459
|
continue;
|
|
341
460
|
}
|
|
342
|
-
const valueValue = asString(entry
|
|
461
|
+
const valueValue = asString(entry.value);
|
|
343
462
|
if (typeof valueValue === "string") {
|
|
344
463
|
parts.push(valueValue);
|
|
345
464
|
continue;
|
|
@@ -376,15 +495,18 @@ function buildChatPrompt(request) {
|
|
|
376
495
|
return request.chatPrompt;
|
|
377
496
|
}
|
|
378
497
|
const systemSegments = [];
|
|
379
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
380
|
-
systemSegments.push(`Guidelines:
|
|
381
|
-
${request.guidelines.trim()}`);
|
|
382
|
-
}
|
|
383
498
|
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
384
499
|
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
385
500
|
systemSegments.push(metadataSystemPrompt.trim());
|
|
501
|
+
} else {
|
|
502
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
503
|
+
}
|
|
504
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
505
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
506
|
+
|
|
507
|
+
${request.guidelines.trim()}`);
|
|
386
508
|
}
|
|
387
|
-
const systemContent = systemSegments.
|
|
509
|
+
const systemContent = systemSegments.join("\n\n");
|
|
388
510
|
const userContent = request.prompt.trim();
|
|
389
511
|
const prompt = [
|
|
390
512
|
{
|
|
@@ -840,11 +962,9 @@ function isLikelyEnvReference(value) {
|
|
|
840
962
|
}
|
|
841
963
|
|
|
842
964
|
// src/evaluation/providers/vscode.ts
|
|
843
|
-
import {
|
|
844
|
-
import { tmpdir } from "node:os";
|
|
965
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
845
966
|
import path2 from "node:path";
|
|
846
967
|
import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
|
|
847
|
-
var PROMPT_FILE_PREFIX = "bbeval-vscode-";
|
|
848
968
|
var VSCodeProvider = class {
|
|
849
969
|
id;
|
|
850
970
|
kind;
|
|
@@ -861,128 +981,89 @@ var VSCodeProvider = class {
|
|
|
861
981
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
862
982
|
}
|
|
863
983
|
const attachments = normalizeAttachments(request.attachments);
|
|
864
|
-
const promptContent = buildPromptDocument(request, attachments);
|
|
865
|
-
const
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
882
|
-
throw new Error(failure);
|
|
883
|
-
}
|
|
884
|
-
if (this.config.dryRun) {
|
|
885
|
-
return {
|
|
886
|
-
text: "",
|
|
887
|
-
raw: {
|
|
888
|
-
session,
|
|
889
|
-
promptFile: promptPath,
|
|
890
|
-
attachments
|
|
891
|
-
}
|
|
892
|
-
};
|
|
893
|
-
}
|
|
894
|
-
const responseText = await readFile2(session.responseFile, "utf8");
|
|
984
|
+
const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
|
|
985
|
+
const session = await dispatchAgentSession({
|
|
986
|
+
userQuery: promptContent,
|
|
987
|
+
// Use full prompt content instead of just request.prompt
|
|
988
|
+
extraAttachments: attachments,
|
|
989
|
+
wait: this.config.waitForResponse,
|
|
990
|
+
dryRun: this.config.dryRun,
|
|
991
|
+
vscodeCmd: this.config.command,
|
|
992
|
+
subagentRoot: this.config.subagentRoot,
|
|
993
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
994
|
+
silent: true
|
|
995
|
+
});
|
|
996
|
+
if (session.exitCode !== 0 || !session.responseFile) {
|
|
997
|
+
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
998
|
+
throw new Error(failure);
|
|
999
|
+
}
|
|
1000
|
+
if (this.config.dryRun) {
|
|
895
1001
|
return {
|
|
896
|
-
text:
|
|
1002
|
+
text: "",
|
|
897
1003
|
raw: {
|
|
898
1004
|
session,
|
|
899
|
-
promptFile: promptPath,
|
|
900
1005
|
attachments
|
|
901
1006
|
}
|
|
902
1007
|
};
|
|
903
|
-
} finally {
|
|
904
|
-
await rm(directory, { recursive: true, force: true });
|
|
905
1008
|
}
|
|
1009
|
+
const responseText = await readFile2(session.responseFile, "utf8");
|
|
1010
|
+
return {
|
|
1011
|
+
text: responseText,
|
|
1012
|
+
raw: {
|
|
1013
|
+
session,
|
|
1014
|
+
attachments
|
|
1015
|
+
}
|
|
1016
|
+
};
|
|
906
1017
|
}
|
|
907
1018
|
};
|
|
908
|
-
function buildPromptDocument(request, attachments) {
|
|
1019
|
+
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
909
1020
|
const parts = [];
|
|
910
|
-
const
|
|
911
|
-
if (
|
|
912
|
-
parts.push(buildMandatoryPrereadBlock(
|
|
913
|
-
}
|
|
914
|
-
parts.push(`# BbEval Request`);
|
|
915
|
-
if (request.testCaseId) {
|
|
916
|
-
parts.push(`- Test Case: ${request.testCaseId}`);
|
|
917
|
-
}
|
|
918
|
-
if (request.metadata?.target) {
|
|
919
|
-
parts.push(`- Target: ${String(request.metadata.target)}`);
|
|
920
|
-
}
|
|
921
|
-
parts.push("\n## Task\n", request.prompt.trim());
|
|
922
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
923
|
-
parts.push("\n## Guidelines\n", request.guidelines.trim());
|
|
924
|
-
}
|
|
925
|
-
if (attachments && attachments.length > 0) {
|
|
926
|
-
const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
|
|
927
|
-
parts.push("\n## Attachments\n", attachmentList);
|
|
1021
|
+
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1022
|
+
if (guidelineFiles.length > 0) {
|
|
1023
|
+
parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
|
|
928
1024
|
}
|
|
1025
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
929
1026
|
return parts.join("\n").trim();
|
|
930
1027
|
}
|
|
931
|
-
function buildMandatoryPrereadBlock(
|
|
932
|
-
if (
|
|
1028
|
+
function buildMandatoryPrereadBlock(guidelineFiles) {
|
|
1029
|
+
if (guidelineFiles.length === 0) {
|
|
933
1030
|
return "";
|
|
934
1031
|
}
|
|
935
1032
|
const fileList = [];
|
|
936
|
-
const tokenList = [];
|
|
937
1033
|
let counter = 0;
|
|
938
|
-
for (const absolutePath of
|
|
1034
|
+
for (const absolutePath of guidelineFiles) {
|
|
939
1035
|
counter += 1;
|
|
940
1036
|
const fileName = path2.basename(absolutePath);
|
|
941
1037
|
const fileUri = pathToFileUri(absolutePath);
|
|
942
|
-
fileList.push(
|
|
943
|
-
tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
|
|
1038
|
+
fileList.push(`* [${fileName}](${fileUri})`);
|
|
944
1039
|
}
|
|
945
|
-
const filesText = fileList.join("
|
|
946
|
-
const tokensText = tokenList.join("\n");
|
|
1040
|
+
const filesText = fileList.join("\n");
|
|
947
1041
|
const instruction = [
|
|
948
|
-
`Read all
|
|
949
|
-
|
|
950
|
-
"`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
|
|
951
|
-
`Then include, at the top of your reply, these exact tokens on separate lines:
|
|
1042
|
+
`Read all guideline files:
|
|
1043
|
+
${filesText}.
|
|
952
1044
|
`,
|
|
953
|
-
tokensText,
|
|
954
|
-
`
|
|
955
|
-
Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
|
|
956
1045
|
`If any file is missing, fail with ERROR: missing-file <filename> and stop.
|
|
957
1046
|
`,
|
|
958
|
-
`Then
|
|
959
|
-
].join("
|
|
960
|
-
return
|
|
961
|
-
|
|
962
|
-
${instruction}
|
|
963
|
-
|
|
964
|
-
`;
|
|
1047
|
+
`Then apply system_instructions on the user query below.`
|
|
1048
|
+
].join("");
|
|
1049
|
+
return `${instruction}`;
|
|
965
1050
|
}
|
|
966
|
-
function
|
|
1051
|
+
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
967
1052
|
if (!attachments || attachments.length === 0) {
|
|
968
1053
|
return [];
|
|
969
1054
|
}
|
|
970
1055
|
const unique = /* @__PURE__ */ new Map();
|
|
971
1056
|
for (const attachment of attachments) {
|
|
972
|
-
if (!isInstructionPath(attachment)) {
|
|
973
|
-
continue;
|
|
974
|
-
}
|
|
975
1057
|
const absolutePath = path2.resolve(attachment);
|
|
976
|
-
|
|
977
|
-
|
|
1058
|
+
const normalized = absolutePath.split(path2.sep).join("/");
|
|
1059
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1060
|
+
if (!unique.has(absolutePath)) {
|
|
1061
|
+
unique.set(absolutePath, absolutePath);
|
|
1062
|
+
}
|
|
978
1063
|
}
|
|
979
1064
|
}
|
|
980
1065
|
return Array.from(unique.values());
|
|
981
1066
|
}
|
|
982
|
-
function isInstructionPath(filePath) {
|
|
983
|
-
const normalized = filePath.split(path2.sep).join("/");
|
|
984
|
-
return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
|
|
985
|
-
}
|
|
986
1067
|
function pathToFileUri(filePath) {
|
|
987
1068
|
const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
|
|
988
1069
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
@@ -991,14 +1072,6 @@ function pathToFileUri(filePath) {
|
|
|
991
1072
|
}
|
|
992
1073
|
return `file://${normalizedPath}`;
|
|
993
1074
|
}
|
|
994
|
-
function composeUserQuery(request) {
|
|
995
|
-
const segments = [];
|
|
996
|
-
segments.push(request.prompt.trim());
|
|
997
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
998
|
-
segments.push("\nGuidelines:\n", request.guidelines.trim());
|
|
999
|
-
}
|
|
1000
|
-
return segments.join("\n").trim();
|
|
1001
|
-
}
|
|
1002
1075
|
function normalizeAttachments(attachments) {
|
|
1003
1076
|
if (!attachments || attachments.length === 0) {
|
|
1004
1077
|
return void 0;
|
|
@@ -1056,18 +1129,24 @@ import { parse as parse2 } from "yaml";
|
|
|
1056
1129
|
function isRecord(value) {
|
|
1057
1130
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1058
1131
|
}
|
|
1059
|
-
function
|
|
1060
|
-
const
|
|
1061
|
-
if (
|
|
1132
|
+
function checkSchema(parsed, absolutePath) {
|
|
1133
|
+
const schema = parsed.$schema;
|
|
1134
|
+
if (schema === void 0) {
|
|
1062
1135
|
throw new Error(
|
|
1063
|
-
`Missing
|
|
1064
|
-
Please add '
|
|
1136
|
+
`Missing $schema field in targets.yaml at ${absolutePath}.
|
|
1137
|
+
Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
|
|
1065
1138
|
);
|
|
1066
1139
|
}
|
|
1067
|
-
if (
|
|
1140
|
+
if (typeof schema !== "string") {
|
|
1068
1141
|
throw new Error(
|
|
1069
|
-
`
|
|
1070
|
-
|
|
1142
|
+
`Invalid $schema field in targets.yaml at ${absolutePath}.
|
|
1143
|
+
Expected a string value '${TARGETS_SCHEMA_V2}'.`
|
|
1144
|
+
);
|
|
1145
|
+
}
|
|
1146
|
+
if (schema !== TARGETS_SCHEMA_V2) {
|
|
1147
|
+
throw new Error(
|
|
1148
|
+
`Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
|
|
1149
|
+
Expected '${TARGETS_SCHEMA_V2}'.`
|
|
1071
1150
|
);
|
|
1072
1151
|
}
|
|
1073
1152
|
}
|
|
@@ -1099,7 +1178,7 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
1099
1178
|
judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
|
|
1100
1179
|
};
|
|
1101
1180
|
}
|
|
1102
|
-
async function
|
|
1181
|
+
async function fileExists3(filePath) {
|
|
1103
1182
|
try {
|
|
1104
1183
|
await access2(filePath, constants2.F_OK);
|
|
1105
1184
|
return true;
|
|
@@ -1109,15 +1188,15 @@ async function fileExists2(filePath) {
|
|
|
1109
1188
|
}
|
|
1110
1189
|
async function readTargetDefinitions(filePath) {
|
|
1111
1190
|
const absolutePath = path3.resolve(filePath);
|
|
1112
|
-
if (!await
|
|
1191
|
+
if (!await fileExists3(absolutePath)) {
|
|
1113
1192
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
1114
1193
|
}
|
|
1115
1194
|
const raw = await readFile3(absolutePath, "utf8");
|
|
1116
1195
|
const parsed = parse2(raw);
|
|
1117
1196
|
if (!isRecord(parsed)) {
|
|
1118
|
-
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '
|
|
1197
|
+
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
1119
1198
|
}
|
|
1120
|
-
|
|
1199
|
+
checkSchema(parsed, absolutePath);
|
|
1121
1200
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
1122
1201
|
const definitions = targets.map((entry, index) => assertTargetDefinition(entry, index, absolutePath));
|
|
1123
1202
|
return definitions;
|
|
@@ -1339,7 +1418,7 @@ import { randomUUID } from "node:crypto";
|
|
|
1339
1418
|
var HeuristicGrader = class {
|
|
1340
1419
|
kind = "heuristic";
|
|
1341
1420
|
grade(context) {
|
|
1342
|
-
const expectedAspects = extractAspects(context.
|
|
1421
|
+
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1343
1422
|
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1344
1423
|
const misses = [...result.misses];
|
|
1345
1424
|
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
@@ -1372,14 +1451,14 @@ var QualityGrader = class {
|
|
|
1372
1451
|
if (!judgeProvider) {
|
|
1373
1452
|
throw new Error("No judge provider available for LLM grading");
|
|
1374
1453
|
}
|
|
1375
|
-
const prompt = buildQualityPrompt(context.
|
|
1454
|
+
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
1376
1455
|
const metadata = {
|
|
1377
1456
|
systemPrompt: QUALITY_SYSTEM_PROMPT
|
|
1378
1457
|
};
|
|
1379
1458
|
const response = await judgeProvider.invoke({
|
|
1380
1459
|
prompt,
|
|
1381
1460
|
metadata,
|
|
1382
|
-
|
|
1461
|
+
evalCaseId: context.evalCase.id,
|
|
1383
1462
|
attempt: context.attempt,
|
|
1384
1463
|
maxOutputTokens: this.maxOutputTokens,
|
|
1385
1464
|
temperature: this.temperature
|
|
@@ -1425,16 +1504,16 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
1425
1504
|
function buildQualityPrompt(testCase, candidate) {
|
|
1426
1505
|
const parts = [
|
|
1427
1506
|
"[[ ## expected_outcome ## ]]",
|
|
1428
|
-
testCase.outcome,
|
|
1507
|
+
testCase.outcome.trim(),
|
|
1429
1508
|
"",
|
|
1430
1509
|
"[[ ## request ## ]]",
|
|
1431
|
-
testCase.task,
|
|
1510
|
+
testCase.task.trim(),
|
|
1432
1511
|
"",
|
|
1433
1512
|
"[[ ## reference_answer ## ]]",
|
|
1434
|
-
testCase.expected_assistant_raw,
|
|
1513
|
+
testCase.expected_assistant_raw.trim(),
|
|
1435
1514
|
"",
|
|
1436
1515
|
"[[ ## generated_answer ## ]]",
|
|
1437
|
-
candidate,
|
|
1516
|
+
candidate.trim(),
|
|
1438
1517
|
"",
|
|
1439
1518
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
1440
1519
|
];
|
|
@@ -1678,17 +1757,17 @@ async function runEvaluation(options) {
|
|
|
1678
1757
|
cache,
|
|
1679
1758
|
useCache,
|
|
1680
1759
|
now,
|
|
1681
|
-
|
|
1760
|
+
evalId,
|
|
1682
1761
|
verbose,
|
|
1683
1762
|
onResult,
|
|
1684
1763
|
onProgress
|
|
1685
1764
|
} = options;
|
|
1686
|
-
const load =
|
|
1687
|
-
const
|
|
1688
|
-
const
|
|
1689
|
-
if (
|
|
1690
|
-
if (
|
|
1691
|
-
throw new Error(`Test case with id '${
|
|
1765
|
+
const load = loadEvalCases;
|
|
1766
|
+
const evalCases = await load(testFilePath, repoRoot, { verbose });
|
|
1767
|
+
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
1768
|
+
if (filteredEvalCases.length === 0) {
|
|
1769
|
+
if (evalId) {
|
|
1770
|
+
throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
|
|
1692
1771
|
}
|
|
1693
1772
|
return [];
|
|
1694
1773
|
}
|
|
@@ -1732,11 +1811,11 @@ async function runEvaluation(options) {
|
|
|
1732
1811
|
};
|
|
1733
1812
|
const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
|
|
1734
1813
|
const primaryProvider = getOrCreateProvider(target);
|
|
1735
|
-
if (onProgress &&
|
|
1736
|
-
for (let i = 0; i <
|
|
1814
|
+
if (onProgress && filteredEvalCases.length > 0) {
|
|
1815
|
+
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1737
1816
|
await onProgress({
|
|
1738
1817
|
workerId: i + 1,
|
|
1739
|
-
|
|
1818
|
+
evalId: filteredEvalCases[i].id,
|
|
1740
1819
|
status: "pending"
|
|
1741
1820
|
});
|
|
1742
1821
|
}
|
|
@@ -1744,23 +1823,23 @@ async function runEvaluation(options) {
|
|
|
1744
1823
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1745
1824
|
const limit = pLimit(workers);
|
|
1746
1825
|
let nextWorkerId = 1;
|
|
1747
|
-
const
|
|
1748
|
-
const promises =
|
|
1749
|
-
(
|
|
1826
|
+
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
1827
|
+
const promises = filteredEvalCases.map(
|
|
1828
|
+
(evalCase) => limit(async () => {
|
|
1750
1829
|
const workerId = nextWorkerId++;
|
|
1751
|
-
|
|
1830
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
1752
1831
|
if (onProgress) {
|
|
1753
1832
|
await onProgress({
|
|
1754
1833
|
workerId,
|
|
1755
|
-
|
|
1834
|
+
evalId: evalCase.id,
|
|
1756
1835
|
status: "running",
|
|
1757
1836
|
startedAt: Date.now()
|
|
1758
1837
|
});
|
|
1759
1838
|
}
|
|
1760
1839
|
try {
|
|
1761
1840
|
const judgeProvider = await resolveJudgeProvider(target);
|
|
1762
|
-
const result = await
|
|
1763
|
-
|
|
1841
|
+
const result = await runEvalCase({
|
|
1842
|
+
evalCase,
|
|
1764
1843
|
provider: primaryProvider,
|
|
1765
1844
|
target,
|
|
1766
1845
|
graders: graderRegistry,
|
|
@@ -1775,7 +1854,7 @@ async function runEvaluation(options) {
|
|
|
1775
1854
|
if (onProgress) {
|
|
1776
1855
|
await onProgress({
|
|
1777
1856
|
workerId,
|
|
1778
|
-
|
|
1857
|
+
evalId: evalCase.id,
|
|
1779
1858
|
status: "completed",
|
|
1780
1859
|
startedAt: 0,
|
|
1781
1860
|
// Not used for completed status
|
|
@@ -1790,7 +1869,7 @@ async function runEvaluation(options) {
|
|
|
1790
1869
|
if (onProgress) {
|
|
1791
1870
|
await onProgress({
|
|
1792
1871
|
workerId,
|
|
1793
|
-
|
|
1872
|
+
evalId: evalCase.id,
|
|
1794
1873
|
status: "failed",
|
|
1795
1874
|
completedAt: Date.now(),
|
|
1796
1875
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -1807,10 +1886,10 @@ async function runEvaluation(options) {
|
|
|
1807
1886
|
if (outcome.status === "fulfilled") {
|
|
1808
1887
|
results.push(outcome.value);
|
|
1809
1888
|
} else {
|
|
1810
|
-
const
|
|
1811
|
-
const promptInputs = await buildPromptInputs(
|
|
1889
|
+
const evalCase = filteredEvalCases[i];
|
|
1890
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1812
1891
|
const errorResult = buildErrorResult(
|
|
1813
|
-
|
|
1892
|
+
evalCase,
|
|
1814
1893
|
target.name,
|
|
1815
1894
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
1816
1895
|
outcome.reason,
|
|
@@ -1824,9 +1903,9 @@ async function runEvaluation(options) {
|
|
|
1824
1903
|
}
|
|
1825
1904
|
return results;
|
|
1826
1905
|
}
|
|
1827
|
-
async function
|
|
1906
|
+
async function runEvalCase(options) {
|
|
1828
1907
|
const {
|
|
1829
|
-
|
|
1908
|
+
evalCase,
|
|
1830
1909
|
provider,
|
|
1831
1910
|
target,
|
|
1832
1911
|
graders,
|
|
@@ -1839,11 +1918,11 @@ async function runTestCase(options) {
|
|
|
1839
1918
|
signal,
|
|
1840
1919
|
judgeProvider
|
|
1841
1920
|
} = options;
|
|
1842
|
-
const promptInputs = await buildPromptInputs(
|
|
1921
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1843
1922
|
if (promptDumpDir) {
|
|
1844
|
-
await dumpPrompt(promptDumpDir,
|
|
1923
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
1845
1924
|
}
|
|
1846
|
-
const cacheKey = useCache ? createCacheKey(provider, target,
|
|
1925
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
1847
1926
|
let cachedResponse;
|
|
1848
1927
|
if (cacheKey && cache) {
|
|
1849
1928
|
cachedResponse = await cache.get(cacheKey);
|
|
@@ -1856,7 +1935,7 @@ async function runTestCase(options) {
|
|
|
1856
1935
|
while (!providerResponse && attempt < attemptBudget) {
|
|
1857
1936
|
try {
|
|
1858
1937
|
providerResponse = await invokeProvider(provider, {
|
|
1859
|
-
|
|
1938
|
+
evalCase,
|
|
1860
1939
|
target,
|
|
1861
1940
|
promptInputs,
|
|
1862
1941
|
attempt,
|
|
@@ -1869,12 +1948,12 @@ async function runTestCase(options) {
|
|
|
1869
1948
|
attempt += 1;
|
|
1870
1949
|
continue;
|
|
1871
1950
|
}
|
|
1872
|
-
return buildErrorResult(
|
|
1951
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
1873
1952
|
}
|
|
1874
1953
|
}
|
|
1875
1954
|
if (!providerResponse) {
|
|
1876
1955
|
return buildErrorResult(
|
|
1877
|
-
|
|
1956
|
+
evalCase,
|
|
1878
1957
|
target.name,
|
|
1879
1958
|
nowFn(),
|
|
1880
1959
|
lastError ?? new Error("Provider did not return a response"),
|
|
@@ -1884,7 +1963,7 @@ async function runTestCase(options) {
|
|
|
1884
1963
|
if (cacheKey && cache && !cachedResponse) {
|
|
1885
1964
|
await cache.set(cacheKey, providerResponse);
|
|
1886
1965
|
}
|
|
1887
|
-
const graderKind =
|
|
1966
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
1888
1967
|
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
1889
1968
|
if (!activeGrader) {
|
|
1890
1969
|
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
@@ -1893,7 +1972,7 @@ async function runTestCase(options) {
|
|
|
1893
1972
|
try {
|
|
1894
1973
|
const gradeTimestamp = nowFn();
|
|
1895
1974
|
grade = await activeGrader.grade({
|
|
1896
|
-
|
|
1975
|
+
evalCase,
|
|
1897
1976
|
candidate: providerResponse.text ?? "",
|
|
1898
1977
|
target,
|
|
1899
1978
|
provider,
|
|
@@ -1903,17 +1982,18 @@ async function runTestCase(options) {
|
|
|
1903
1982
|
judgeProvider
|
|
1904
1983
|
});
|
|
1905
1984
|
} catch (error) {
|
|
1906
|
-
return buildErrorResult(
|
|
1985
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
1907
1986
|
}
|
|
1908
1987
|
const completedAt = nowFn();
|
|
1909
1988
|
const rawRequest = {
|
|
1910
1989
|
request: promptInputs.request,
|
|
1911
1990
|
guidelines: promptInputs.guidelines,
|
|
1912
|
-
guideline_paths:
|
|
1991
|
+
guideline_paths: evalCase.guideline_paths,
|
|
1992
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
1913
1993
|
};
|
|
1914
1994
|
return {
|
|
1915
|
-
|
|
1916
|
-
conversation_id:
|
|
1995
|
+
eval_id: evalCase.id,
|
|
1996
|
+
conversation_id: evalCase.conversation_id,
|
|
1917
1997
|
score: grade.score,
|
|
1918
1998
|
hits: grade.hits,
|
|
1919
1999
|
misses: grade.misses,
|
|
@@ -1927,11 +2007,11 @@ async function runTestCase(options) {
|
|
|
1927
2007
|
grader_raw_request: grade.graderRawRequest
|
|
1928
2008
|
};
|
|
1929
2009
|
}
|
|
1930
|
-
function
|
|
1931
|
-
if (!
|
|
1932
|
-
return
|
|
2010
|
+
function filterEvalCases(evalCases, evalId) {
|
|
2011
|
+
if (!evalId) {
|
|
2012
|
+
return evalCases;
|
|
1933
2013
|
}
|
|
1934
|
-
return
|
|
2014
|
+
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
1935
2015
|
}
|
|
1936
2016
|
function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
1937
2017
|
const heuristic = overrides?.heuristic ?? new HeuristicGrader();
|
|
@@ -1949,16 +2029,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
1949
2029
|
llm_judge: llmJudge
|
|
1950
2030
|
};
|
|
1951
2031
|
}
|
|
1952
|
-
async function dumpPrompt(directory,
|
|
2032
|
+
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
1953
2033
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1954
|
-
const filename = `${timestamp}_${sanitizeFilename(
|
|
2034
|
+
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
1955
2035
|
const filePath = path4.resolve(directory, filename);
|
|
1956
2036
|
await mkdir(path4.dirname(filePath), { recursive: true });
|
|
1957
2037
|
const payload = {
|
|
1958
|
-
|
|
2038
|
+
eval_id: evalCase.id,
|
|
1959
2039
|
request: promptInputs.request,
|
|
1960
2040
|
guidelines: promptInputs.guidelines,
|
|
1961
|
-
guideline_paths:
|
|
2041
|
+
guideline_paths: evalCase.guideline_paths
|
|
1962
2042
|
};
|
|
1963
2043
|
await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
1964
2044
|
}
|
|
@@ -1970,7 +2050,7 @@ function sanitizeFilename(value) {
|
|
|
1970
2050
|
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
1971
2051
|
}
|
|
1972
2052
|
async function invokeProvider(provider, options) {
|
|
1973
|
-
const {
|
|
2053
|
+
const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
1974
2054
|
const controller = new AbortController();
|
|
1975
2055
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
1976
2056
|
if (signal) {
|
|
@@ -1980,12 +2060,12 @@ async function invokeProvider(provider, options) {
|
|
|
1980
2060
|
return await provider.invoke({
|
|
1981
2061
|
prompt: promptInputs.request,
|
|
1982
2062
|
guidelines: promptInputs.guidelines,
|
|
1983
|
-
|
|
1984
|
-
|
|
2063
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2064
|
+
attachments: evalCase.file_paths,
|
|
2065
|
+
evalCaseId: evalCase.id,
|
|
1985
2066
|
attempt,
|
|
1986
2067
|
metadata: {
|
|
1987
|
-
|
|
1988
|
-
grader: testCase.grader
|
|
2068
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
1989
2069
|
},
|
|
1990
2070
|
signal: controller.signal
|
|
1991
2071
|
});
|
|
@@ -1995,17 +2075,18 @@ async function invokeProvider(provider, options) {
|
|
|
1995
2075
|
}
|
|
1996
2076
|
}
|
|
1997
2077
|
}
|
|
1998
|
-
function buildErrorResult(
|
|
2078
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
1999
2079
|
const message = error instanceof Error ? error.message : String(error);
|
|
2000
2080
|
const rawRequest = {
|
|
2001
2081
|
request: promptInputs.request,
|
|
2002
2082
|
guidelines: promptInputs.guidelines,
|
|
2003
|
-
guideline_paths:
|
|
2083
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2084
|
+
system_message: promptInputs.systemMessage ?? "",
|
|
2004
2085
|
error: message
|
|
2005
2086
|
};
|
|
2006
2087
|
return {
|
|
2007
|
-
|
|
2008
|
-
conversation_id:
|
|
2088
|
+
eval_id: evalCase.id,
|
|
2089
|
+
conversation_id: evalCase.conversation_id,
|
|
2009
2090
|
score: 0,
|
|
2010
2091
|
hits: [],
|
|
2011
2092
|
misses: [`Error: ${message}`],
|
|
@@ -2017,13 +2098,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
|
|
|
2017
2098
|
raw_request: rawRequest
|
|
2018
2099
|
};
|
|
2019
2100
|
}
|
|
2020
|
-
function createCacheKey(provider, target,
|
|
2101
|
+
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
2021
2102
|
const hash = createHash("sha256");
|
|
2022
2103
|
hash.update(provider.id);
|
|
2023
2104
|
hash.update(target.name);
|
|
2024
|
-
hash.update(
|
|
2105
|
+
hash.update(evalCase.id);
|
|
2025
2106
|
hash.update(promptInputs.request);
|
|
2026
2107
|
hash.update(promptInputs.guidelines);
|
|
2108
|
+
hash.update(promptInputs.systemMessage ?? "");
|
|
2027
2109
|
return hash.digest("hex");
|
|
2028
2110
|
}
|
|
2029
2111
|
function isTimeoutLike(error) {
|
|
@@ -2051,7 +2133,9 @@ export {
|
|
|
2051
2133
|
HeuristicGrader,
|
|
2052
2134
|
QualityGrader,
|
|
2053
2135
|
TEST_MESSAGE_ROLES,
|
|
2136
|
+
buildDirectoryChain,
|
|
2054
2137
|
buildPromptInputs,
|
|
2138
|
+
buildSearchRoots,
|
|
2055
2139
|
calculateHits,
|
|
2056
2140
|
calculateMisses,
|
|
2057
2141
|
createAgentKernel,
|
|
@@ -2059,6 +2143,8 @@ export {
|
|
|
2059
2143
|
ensureVSCodeSubagents,
|
|
2060
2144
|
extractAspects,
|
|
2061
2145
|
extractCodeBlocks,
|
|
2146
|
+
fileExists,
|
|
2147
|
+
findGitRoot,
|
|
2062
2148
|
getHitCount,
|
|
2063
2149
|
isErrorLike,
|
|
2064
2150
|
isGraderKind,
|
|
@@ -2068,12 +2154,13 @@ export {
|
|
|
2068
2154
|
isTestMessage,
|
|
2069
2155
|
isTestMessageRole,
|
|
2070
2156
|
listTargetNames,
|
|
2071
|
-
|
|
2157
|
+
loadEvalCases,
|
|
2072
2158
|
readTargetDefinitions,
|
|
2073
2159
|
resolveAndCreateProvider,
|
|
2160
|
+
resolveFileReference,
|
|
2074
2161
|
resolveTargetDefinition,
|
|
2162
|
+
runEvalCase,
|
|
2075
2163
|
runEvaluation,
|
|
2076
|
-
runTestCase,
|
|
2077
2164
|
scoreCandidateResponse
|
|
2078
2165
|
};
|
|
2079
2166
|
//# sourceMappingURL=index.js.map
|