@agentv/core 0.2.3 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5REK5RSI.js → chunk-XXNQA4EW.js} +56 -2
- package/dist/chunk-XXNQA4EW.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +123 -12
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +7 -2
- package/dist/evaluation/validation/index.d.ts +7 -2
- package/dist/evaluation/validation/index.js +97 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +334 -201
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +48 -19
- package/dist/index.d.ts +48 -19
- package/dist/index.js +293 -206
- package/dist/index.js.map +1 -1
- package/package.json +6 -2
- package/dist/chunk-5REK5RSI.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -34,7 +34,9 @@ __export(index_exports, {
|
|
|
34
34
|
HeuristicGrader: () => HeuristicGrader,
|
|
35
35
|
QualityGrader: () => QualityGrader,
|
|
36
36
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
|
+
buildDirectoryChain: () => buildDirectoryChain,
|
|
37
38
|
buildPromptInputs: () => buildPromptInputs,
|
|
39
|
+
buildSearchRoots: () => buildSearchRoots,
|
|
38
40
|
calculateHits: () => calculateHits,
|
|
39
41
|
calculateMisses: () => calculateMisses,
|
|
40
42
|
createAgentKernel: () => createAgentKernel,
|
|
@@ -42,6 +44,8 @@ __export(index_exports, {
|
|
|
42
44
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
43
45
|
extractAspects: () => extractAspects,
|
|
44
46
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
47
|
+
fileExists: () => fileExists,
|
|
48
|
+
findGitRoot: () => findGitRoot,
|
|
45
49
|
getHitCount: () => getHitCount,
|
|
46
50
|
isErrorLike: () => isErrorLike,
|
|
47
51
|
isGraderKind: () => isGraderKind,
|
|
@@ -51,12 +55,13 @@ __export(index_exports, {
|
|
|
51
55
|
isTestMessage: () => isTestMessage,
|
|
52
56
|
isTestMessageRole: () => isTestMessageRole,
|
|
53
57
|
listTargetNames: () => listTargetNames,
|
|
54
|
-
|
|
58
|
+
loadEvalCases: () => loadEvalCases,
|
|
55
59
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
56
60
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
61
|
+
resolveFileReference: () => resolveFileReference,
|
|
57
62
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
63
|
+
runEvalCase: () => runEvalCase,
|
|
58
64
|
runEvaluation: () => runEvaluation,
|
|
59
|
-
runTestCase: () => runTestCase,
|
|
60
65
|
scoreCandidateResponse: () => scoreCandidateResponse
|
|
61
66
|
});
|
|
62
67
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -113,6 +118,7 @@ function getHitCount(result) {
|
|
|
113
118
|
}
|
|
114
119
|
|
|
115
120
|
// src/evaluation/yaml-parser.ts
|
|
121
|
+
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
116
122
|
var import_node_fs2 = require("fs");
|
|
117
123
|
var import_promises2 = require("fs/promises");
|
|
118
124
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
@@ -131,6 +137,46 @@ async function fileExists(filePath) {
|
|
|
131
137
|
return false;
|
|
132
138
|
}
|
|
133
139
|
}
|
|
140
|
+
async function findGitRoot(startPath) {
|
|
141
|
+
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
142
|
+
const root = import_node_path.default.parse(currentDir).root;
|
|
143
|
+
while (currentDir !== root) {
|
|
144
|
+
const gitPath = import_node_path.default.join(currentDir, ".git");
|
|
145
|
+
if (await fileExists(gitPath)) {
|
|
146
|
+
return currentDir;
|
|
147
|
+
}
|
|
148
|
+
const parentDir = import_node_path.default.dirname(currentDir);
|
|
149
|
+
if (parentDir === currentDir) {
|
|
150
|
+
break;
|
|
151
|
+
}
|
|
152
|
+
currentDir = parentDir;
|
|
153
|
+
}
|
|
154
|
+
return null;
|
|
155
|
+
}
|
|
156
|
+
function buildDirectoryChain(filePath, repoRoot) {
|
|
157
|
+
const directories = [];
|
|
158
|
+
const seen = /* @__PURE__ */ new Set();
|
|
159
|
+
const boundary = import_node_path.default.resolve(repoRoot);
|
|
160
|
+
let current = import_node_path.default.resolve(import_node_path.default.dirname(filePath));
|
|
161
|
+
while (current !== void 0) {
|
|
162
|
+
if (!seen.has(current)) {
|
|
163
|
+
directories.push(current);
|
|
164
|
+
seen.add(current);
|
|
165
|
+
}
|
|
166
|
+
if (current === boundary) {
|
|
167
|
+
break;
|
|
168
|
+
}
|
|
169
|
+
const parent = import_node_path.default.dirname(current);
|
|
170
|
+
if (parent === current) {
|
|
171
|
+
break;
|
|
172
|
+
}
|
|
173
|
+
current = parent;
|
|
174
|
+
}
|
|
175
|
+
if (!seen.has(boundary)) {
|
|
176
|
+
directories.push(boundary);
|
|
177
|
+
}
|
|
178
|
+
return directories;
|
|
179
|
+
}
|
|
134
180
|
function buildSearchRoots(evalPath, repoRoot) {
|
|
135
181
|
const uniqueRoots = [];
|
|
136
182
|
const addRoot = (root) => {
|
|
@@ -188,9 +234,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
|
188
234
|
var ANSI_YELLOW = "\x1B[33m";
|
|
189
235
|
var ANSI_RESET = "\x1B[0m";
|
|
190
236
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
191
|
-
|
|
237
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
238
|
+
async function loadConfig(evalFilePath, repoRoot) {
|
|
239
|
+
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
240
|
+
for (const directory of directories) {
|
|
241
|
+
const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
|
|
242
|
+
if (!await fileExists2(configPath)) {
|
|
243
|
+
continue;
|
|
244
|
+
}
|
|
245
|
+
try {
|
|
246
|
+
const rawConfig = await (0, import_promises2.readFile)(configPath, "utf8");
|
|
247
|
+
const parsed = (0, import_yaml.parse)(rawConfig);
|
|
248
|
+
if (!isJsonObject(parsed)) {
|
|
249
|
+
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
250
|
+
continue;
|
|
251
|
+
}
|
|
252
|
+
const config = parsed;
|
|
253
|
+
const schema = config.$schema;
|
|
254
|
+
if (schema !== SCHEMA_CONFIG_V2) {
|
|
255
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
256
|
+
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
257
|
+
logWarning(message);
|
|
258
|
+
continue;
|
|
259
|
+
}
|
|
260
|
+
const guidelinePatterns = config.guideline_patterns;
|
|
261
|
+
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
262
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
265
|
+
if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
|
|
266
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
|
|
267
|
+
continue;
|
|
268
|
+
}
|
|
269
|
+
return {
|
|
270
|
+
guideline_patterns: guidelinePatterns
|
|
271
|
+
};
|
|
272
|
+
} catch (error) {
|
|
273
|
+
logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
|
|
274
|
+
continue;
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
return null;
|
|
278
|
+
}
|
|
279
|
+
function isGuidelineFile(filePath, patterns) {
|
|
192
280
|
const normalized = filePath.split("\\").join("/");
|
|
193
|
-
|
|
281
|
+
const patternsToUse = patterns ?? [];
|
|
282
|
+
return import_micromatch.default.isMatch(normalized, patternsToUse);
|
|
194
283
|
}
|
|
195
284
|
function extractCodeBlocks(segments) {
|
|
196
285
|
const codeBlocks = [];
|
|
@@ -210,43 +299,45 @@ function extractCodeBlocks(segments) {
|
|
|
210
299
|
}
|
|
211
300
|
return codeBlocks;
|
|
212
301
|
}
|
|
213
|
-
async function
|
|
302
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
214
303
|
const verbose = options?.verbose ?? false;
|
|
215
|
-
const absoluteTestPath = import_node_path2.default.resolve(
|
|
304
|
+
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
216
305
|
if (!await fileExists2(absoluteTestPath)) {
|
|
217
|
-
throw new Error(`Test file not found: ${
|
|
306
|
+
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
218
307
|
}
|
|
219
308
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
220
309
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
310
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
311
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
221
312
|
const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
|
|
222
313
|
const parsed = (0, import_yaml.parse)(rawFile);
|
|
223
314
|
if (!isJsonObject(parsed)) {
|
|
224
|
-
throw new Error(`Invalid test file format: ${
|
|
315
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
225
316
|
}
|
|
226
317
|
const suite = parsed;
|
|
227
318
|
const schema = suite.$schema;
|
|
228
319
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
229
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${
|
|
320
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
230
321
|
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
231
322
|
throw new Error(message);
|
|
232
323
|
}
|
|
233
324
|
const rawTestcases = suite.evalcases;
|
|
234
325
|
if (!Array.isArray(rawTestcases)) {
|
|
235
|
-
throw new Error(`Invalid test file format: ${
|
|
326
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
236
327
|
}
|
|
237
328
|
const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
|
|
238
329
|
const results = [];
|
|
239
|
-
for (const
|
|
240
|
-
if (!isJsonObject(
|
|
330
|
+
for (const rawEvalcase of rawTestcases) {
|
|
331
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
241
332
|
logWarning("Skipping invalid test case entry (expected object)");
|
|
242
333
|
continue;
|
|
243
334
|
}
|
|
244
|
-
const
|
|
245
|
-
const id = asString(
|
|
246
|
-
const conversationId = asString(
|
|
247
|
-
const outcome = asString(
|
|
248
|
-
const inputMessagesValue =
|
|
249
|
-
const expectedMessagesValue =
|
|
335
|
+
const evalcase = rawEvalcase;
|
|
336
|
+
const id = asString(evalcase.id);
|
|
337
|
+
const conversationId = asString(evalcase.conversation_id);
|
|
338
|
+
const outcome = asString(evalcase.outcome);
|
|
339
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
340
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
250
341
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
251
342
|
logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
|
|
252
343
|
continue;
|
|
@@ -259,6 +350,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
259
350
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
260
351
|
const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
|
|
261
352
|
const userMessages = inputMessages.filter((message) => message.role === "user");
|
|
353
|
+
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
262
354
|
if (assistantMessages.length === 0) {
|
|
263
355
|
logWarning(`No assistant message found for test case: ${id}`);
|
|
264
356
|
continue;
|
|
@@ -266,6 +358,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
266
358
|
if (assistantMessages.length > 1) {
|
|
267
359
|
logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
|
|
268
360
|
}
|
|
361
|
+
if (systemMessages.length > 1) {
|
|
362
|
+
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
363
|
+
}
|
|
364
|
+
let systemMessageContent;
|
|
365
|
+
if (systemMessages.length > 0) {
|
|
366
|
+
const content = systemMessages[0]?.content;
|
|
367
|
+
if (typeof content === "string") {
|
|
368
|
+
systemMessageContent = content;
|
|
369
|
+
} else if (Array.isArray(content)) {
|
|
370
|
+
const textParts = [];
|
|
371
|
+
for (const segment of content) {
|
|
372
|
+
if (isJsonObject(segment)) {
|
|
373
|
+
const value = segment.value;
|
|
374
|
+
if (typeof value === "string") {
|
|
375
|
+
textParts.push(value);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
if (textParts.length > 0) {
|
|
380
|
+
systemMessageContent = textParts.join("\n\n");
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
}
|
|
269
384
|
const userSegments = [];
|
|
270
385
|
const guidelinePaths = [];
|
|
271
386
|
const userTextParts = [];
|
|
@@ -297,7 +412,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
297
412
|
}
|
|
298
413
|
try {
|
|
299
414
|
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
300
|
-
|
|
415
|
+
const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
|
|
416
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
301
417
|
guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
|
|
302
418
|
if (verbose) {
|
|
303
419
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
@@ -307,7 +423,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
307
423
|
userSegments.push({
|
|
308
424
|
type: "file",
|
|
309
425
|
path: displayPath,
|
|
310
|
-
text: fileContent
|
|
426
|
+
text: fileContent,
|
|
427
|
+
resolvedPath: import_node_path2.default.resolve(resolvedPath)
|
|
311
428
|
});
|
|
312
429
|
if (verbose) {
|
|
313
430
|
console.log(` [File] Found: ${displayPath}`);
|
|
@@ -329,16 +446,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
329
446
|
}
|
|
330
447
|
const codeSnippets = extractCodeBlocks(userSegments);
|
|
331
448
|
const assistantContent = assistantMessages[0]?.content;
|
|
332
|
-
const expectedAssistantRaw =
|
|
449
|
+
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
333
450
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
334
|
-
const testCaseGrader = coerceGrader(
|
|
451
|
+
const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
|
|
452
|
+
const userFilePaths = [];
|
|
453
|
+
for (const segment of userSegments) {
|
|
454
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
455
|
+
userFilePaths.push(segment.resolvedPath);
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
const allFilePaths = [
|
|
459
|
+
...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
460
|
+
...userFilePaths
|
|
461
|
+
];
|
|
335
462
|
const testCase = {
|
|
336
463
|
id,
|
|
337
464
|
conversation_id: conversationId,
|
|
338
465
|
task: userTextPrompt,
|
|
339
466
|
user_segments: userSegments,
|
|
467
|
+
system_message: systemMessageContent,
|
|
340
468
|
expected_assistant_raw: expectedAssistantRaw,
|
|
341
469
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
470
|
+
guideline_patterns: guidelinePatterns,
|
|
471
|
+
file_paths: allFilePaths,
|
|
342
472
|
code_snippets: codeSnippets,
|
|
343
473
|
outcome,
|
|
344
474
|
grader: testCaseGrader
|
|
@@ -404,7 +534,7 @@ ${body}`);
|
|
|
404
534
|
}
|
|
405
535
|
const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
406
536
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
407
|
-
return { request, guidelines };
|
|
537
|
+
return { request, guidelines, systemMessage: testCase.system_message };
|
|
408
538
|
}
|
|
409
539
|
async function fileExists2(absolutePath) {
|
|
410
540
|
try {
|
|
@@ -445,7 +575,7 @@ function cloneJsonValue(value) {
|
|
|
445
575
|
}
|
|
446
576
|
return cloneJsonObject(value);
|
|
447
577
|
}
|
|
448
|
-
function
|
|
578
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
449
579
|
if (typeof content === "string") {
|
|
450
580
|
return content;
|
|
451
581
|
}
|
|
@@ -458,12 +588,42 @@ function normalizeAssistantContent(content) {
|
|
|
458
588
|
parts.push(entry);
|
|
459
589
|
continue;
|
|
460
590
|
}
|
|
461
|
-
|
|
591
|
+
if (!isJsonObject(entry)) {
|
|
592
|
+
continue;
|
|
593
|
+
}
|
|
594
|
+
const segmentType = asString(entry.type);
|
|
595
|
+
if (segmentType === "file") {
|
|
596
|
+
const rawValue = asString(entry.value);
|
|
597
|
+
if (!rawValue) {
|
|
598
|
+
continue;
|
|
599
|
+
}
|
|
600
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
601
|
+
rawValue,
|
|
602
|
+
searchRoots
|
|
603
|
+
);
|
|
604
|
+
if (!resolvedPath) {
|
|
605
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
606
|
+
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
607
|
+
continue;
|
|
608
|
+
}
|
|
609
|
+
try {
|
|
610
|
+
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
611
|
+
parts.push(fileContent);
|
|
612
|
+
if (verbose) {
|
|
613
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
614
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
615
|
+
}
|
|
616
|
+
} catch (error) {
|
|
617
|
+
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
618
|
+
}
|
|
619
|
+
continue;
|
|
620
|
+
}
|
|
621
|
+
const textValue = asString(entry.text);
|
|
462
622
|
if (typeof textValue === "string") {
|
|
463
623
|
parts.push(textValue);
|
|
464
624
|
continue;
|
|
465
625
|
}
|
|
466
|
-
const valueValue = asString(entry
|
|
626
|
+
const valueValue = asString(entry.value);
|
|
467
627
|
if (typeof valueValue === "string") {
|
|
468
628
|
parts.push(valueValue);
|
|
469
629
|
continue;
|
|
@@ -500,15 +660,18 @@ function buildChatPrompt(request) {
|
|
|
500
660
|
return request.chatPrompt;
|
|
501
661
|
}
|
|
502
662
|
const systemSegments = [];
|
|
503
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
504
|
-
systemSegments.push(`Guidelines:
|
|
505
|
-
${request.guidelines.trim()}`);
|
|
506
|
-
}
|
|
507
663
|
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
508
664
|
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
509
665
|
systemSegments.push(metadataSystemPrompt.trim());
|
|
666
|
+
} else {
|
|
667
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
510
668
|
}
|
|
511
|
-
|
|
669
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
670
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
671
|
+
|
|
672
|
+
${request.guidelines.trim()}`);
|
|
673
|
+
}
|
|
674
|
+
const systemContent = systemSegments.join("\n\n");
|
|
512
675
|
const userContent = request.prompt.trim();
|
|
513
676
|
const prompt = [
|
|
514
677
|
{
|
|
@@ -965,10 +1128,8 @@ function isLikelyEnvReference(value) {
|
|
|
965
1128
|
|
|
966
1129
|
// src/evaluation/providers/vscode.ts
|
|
967
1130
|
var import_promises3 = require("fs/promises");
|
|
968
|
-
var import_node_os = require("os");
|
|
969
1131
|
var import_node_path3 = __toESM(require("path"), 1);
|
|
970
1132
|
var import_subagent = require("subagent");
|
|
971
|
-
var PROMPT_FILE_PREFIX = "bbeval-vscode-";
|
|
972
1133
|
var VSCodeProvider = class {
|
|
973
1134
|
id;
|
|
974
1135
|
kind;
|
|
@@ -985,128 +1146,89 @@ var VSCodeProvider = class {
|
|
|
985
1146
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
986
1147
|
}
|
|
987
1148
|
const attachments = normalizeAttachments(request.attachments);
|
|
988
|
-
const promptContent = buildPromptDocument(request, attachments);
|
|
989
|
-
const
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
1006
|
-
throw new Error(failure);
|
|
1007
|
-
}
|
|
1008
|
-
if (this.config.dryRun) {
|
|
1009
|
-
return {
|
|
1010
|
-
text: "",
|
|
1011
|
-
raw: {
|
|
1012
|
-
session,
|
|
1013
|
-
promptFile: promptPath,
|
|
1014
|
-
attachments
|
|
1015
|
-
}
|
|
1016
|
-
};
|
|
1017
|
-
}
|
|
1018
|
-
const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
|
|
1149
|
+
const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
|
|
1150
|
+
const session = await (0, import_subagent.dispatchAgentSession)({
|
|
1151
|
+
userQuery: promptContent,
|
|
1152
|
+
// Use full prompt content instead of just request.prompt
|
|
1153
|
+
extraAttachments: attachments,
|
|
1154
|
+
wait: this.config.waitForResponse,
|
|
1155
|
+
dryRun: this.config.dryRun,
|
|
1156
|
+
vscodeCmd: this.config.command,
|
|
1157
|
+
subagentRoot: this.config.subagentRoot,
|
|
1158
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1159
|
+
silent: true
|
|
1160
|
+
});
|
|
1161
|
+
if (session.exitCode !== 0 || !session.responseFile) {
|
|
1162
|
+
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
1163
|
+
throw new Error(failure);
|
|
1164
|
+
}
|
|
1165
|
+
if (this.config.dryRun) {
|
|
1019
1166
|
return {
|
|
1020
|
-
text:
|
|
1167
|
+
text: "",
|
|
1021
1168
|
raw: {
|
|
1022
1169
|
session,
|
|
1023
|
-
promptFile: promptPath,
|
|
1024
1170
|
attachments
|
|
1025
1171
|
}
|
|
1026
1172
|
};
|
|
1027
|
-
} finally {
|
|
1028
|
-
await (0, import_promises3.rm)(directory, { recursive: true, force: true });
|
|
1029
1173
|
}
|
|
1174
|
+
const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
|
|
1175
|
+
return {
|
|
1176
|
+
text: responseText,
|
|
1177
|
+
raw: {
|
|
1178
|
+
session,
|
|
1179
|
+
attachments
|
|
1180
|
+
}
|
|
1181
|
+
};
|
|
1030
1182
|
}
|
|
1031
1183
|
};
|
|
1032
|
-
function buildPromptDocument(request, attachments) {
|
|
1184
|
+
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
1033
1185
|
const parts = [];
|
|
1034
|
-
const
|
|
1035
|
-
if (
|
|
1036
|
-
parts.push(buildMandatoryPrereadBlock(
|
|
1037
|
-
}
|
|
1038
|
-
parts.push(`# BbEval Request`);
|
|
1039
|
-
if (request.testCaseId) {
|
|
1040
|
-
parts.push(`- Test Case: ${request.testCaseId}`);
|
|
1041
|
-
}
|
|
1042
|
-
if (request.metadata?.target) {
|
|
1043
|
-
parts.push(`- Target: ${String(request.metadata.target)}`);
|
|
1044
|
-
}
|
|
1045
|
-
parts.push("\n## Task\n", request.prompt.trim());
|
|
1046
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1047
|
-
parts.push("\n## Guidelines\n", request.guidelines.trim());
|
|
1048
|
-
}
|
|
1049
|
-
if (attachments && attachments.length > 0) {
|
|
1050
|
-
const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
|
|
1051
|
-
parts.push("\n## Attachments\n", attachmentList);
|
|
1186
|
+
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1187
|
+
if (guidelineFiles.length > 0) {
|
|
1188
|
+
parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
|
|
1052
1189
|
}
|
|
1190
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1053
1191
|
return parts.join("\n").trim();
|
|
1054
1192
|
}
|
|
1055
|
-
function buildMandatoryPrereadBlock(
|
|
1056
|
-
if (
|
|
1193
|
+
function buildMandatoryPrereadBlock(guidelineFiles) {
|
|
1194
|
+
if (guidelineFiles.length === 0) {
|
|
1057
1195
|
return "";
|
|
1058
1196
|
}
|
|
1059
1197
|
const fileList = [];
|
|
1060
|
-
const tokenList = [];
|
|
1061
1198
|
let counter = 0;
|
|
1062
|
-
for (const absolutePath of
|
|
1199
|
+
for (const absolutePath of guidelineFiles) {
|
|
1063
1200
|
counter += 1;
|
|
1064
1201
|
const fileName = import_node_path3.default.basename(absolutePath);
|
|
1065
1202
|
const fileUri = pathToFileUri(absolutePath);
|
|
1066
|
-
fileList.push(
|
|
1067
|
-
tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
|
|
1203
|
+
fileList.push(`* [${fileName}](${fileUri})`);
|
|
1068
1204
|
}
|
|
1069
|
-
const filesText = fileList.join("
|
|
1070
|
-
const tokensText = tokenList.join("\n");
|
|
1205
|
+
const filesText = fileList.join("\n");
|
|
1071
1206
|
const instruction = [
|
|
1072
|
-
`Read all
|
|
1073
|
-
|
|
1074
|
-
"`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
|
|
1075
|
-
`Then include, at the top of your reply, these exact tokens on separate lines:
|
|
1207
|
+
`Read all guideline files:
|
|
1208
|
+
${filesText}.
|
|
1076
1209
|
`,
|
|
1077
|
-
tokensText,
|
|
1078
|
-
`
|
|
1079
|
-
Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
|
|
1080
1210
|
`If any file is missing, fail with ERROR: missing-file <filename> and stop.
|
|
1081
1211
|
`,
|
|
1082
|
-
`Then
|
|
1083
|
-
].join("
|
|
1084
|
-
return
|
|
1085
|
-
|
|
1086
|
-
${instruction}
|
|
1087
|
-
|
|
1088
|
-
`;
|
|
1212
|
+
`Then apply system_instructions on the user query below.`
|
|
1213
|
+
].join("");
|
|
1214
|
+
return `${instruction}`;
|
|
1089
1215
|
}
|
|
1090
|
-
function
|
|
1216
|
+
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
1091
1217
|
if (!attachments || attachments.length === 0) {
|
|
1092
1218
|
return [];
|
|
1093
1219
|
}
|
|
1094
1220
|
const unique = /* @__PURE__ */ new Map();
|
|
1095
1221
|
for (const attachment of attachments) {
|
|
1096
|
-
if (!isInstructionPath(attachment)) {
|
|
1097
|
-
continue;
|
|
1098
|
-
}
|
|
1099
1222
|
const absolutePath = import_node_path3.default.resolve(attachment);
|
|
1100
|
-
|
|
1101
|
-
|
|
1223
|
+
const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
|
|
1224
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1225
|
+
if (!unique.has(absolutePath)) {
|
|
1226
|
+
unique.set(absolutePath, absolutePath);
|
|
1227
|
+
}
|
|
1102
1228
|
}
|
|
1103
1229
|
}
|
|
1104
1230
|
return Array.from(unique.values());
|
|
1105
1231
|
}
|
|
1106
|
-
function isInstructionPath(filePath) {
|
|
1107
|
-
const normalized = filePath.split(import_node_path3.default.sep).join("/");
|
|
1108
|
-
return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
|
|
1109
|
-
}
|
|
1110
1232
|
function pathToFileUri(filePath) {
|
|
1111
1233
|
const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
|
|
1112
1234
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
@@ -1115,14 +1237,6 @@ function pathToFileUri(filePath) {
|
|
|
1115
1237
|
}
|
|
1116
1238
|
return `file://${normalizedPath}`;
|
|
1117
1239
|
}
|
|
1118
|
-
function composeUserQuery(request) {
|
|
1119
|
-
const segments = [];
|
|
1120
|
-
segments.push(request.prompt.trim());
|
|
1121
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1122
|
-
segments.push("\nGuidelines:\n", request.guidelines.trim());
|
|
1123
|
-
}
|
|
1124
|
-
return segments.join("\n").trim();
|
|
1125
|
-
}
|
|
1126
1240
|
function normalizeAttachments(attachments) {
|
|
1127
1241
|
if (!attachments || attachments.length === 0) {
|
|
1128
1242
|
return void 0;
|
|
@@ -1177,21 +1291,32 @@ var import_node_fs3 = require("fs");
|
|
|
1177
1291
|
var import_promises4 = require("fs/promises");
|
|
1178
1292
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
1179
1293
|
var import_yaml2 = require("yaml");
|
|
1294
|
+
|
|
1295
|
+
// src/evaluation/providers/types.ts
|
|
1296
|
+
var TARGETS_SCHEMA_V2 = "agentv-targets-v2";
|
|
1297
|
+
|
|
1298
|
+
// src/evaluation/providers/targets-file.ts
|
|
1180
1299
|
function isRecord(value) {
|
|
1181
1300
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
1182
1301
|
}
|
|
1183
|
-
function
|
|
1184
|
-
const
|
|
1185
|
-
if (
|
|
1302
|
+
function checkSchema(parsed, absolutePath) {
|
|
1303
|
+
const schema = parsed.$schema;
|
|
1304
|
+
if (schema === void 0) {
|
|
1305
|
+
throw new Error(
|
|
1306
|
+
`Missing $schema field in targets.yaml at ${absolutePath}.
|
|
1307
|
+
Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
|
|
1308
|
+
);
|
|
1309
|
+
}
|
|
1310
|
+
if (typeof schema !== "string") {
|
|
1186
1311
|
throw new Error(
|
|
1187
|
-
`
|
|
1188
|
-
|
|
1312
|
+
`Invalid $schema field in targets.yaml at ${absolutePath}.
|
|
1313
|
+
Expected a string value '${TARGETS_SCHEMA_V2}'.`
|
|
1189
1314
|
);
|
|
1190
1315
|
}
|
|
1191
|
-
if (
|
|
1316
|
+
if (schema !== TARGETS_SCHEMA_V2) {
|
|
1192
1317
|
throw new Error(
|
|
1193
|
-
`
|
|
1194
|
-
|
|
1318
|
+
`Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
|
|
1319
|
+
Expected '${TARGETS_SCHEMA_V2}'.`
|
|
1195
1320
|
);
|
|
1196
1321
|
}
|
|
1197
1322
|
}
|
|
@@ -1239,9 +1364,9 @@ async function readTargetDefinitions(filePath) {
|
|
|
1239
1364
|
const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
|
|
1240
1365
|
const parsed = (0, import_yaml2.parse)(raw);
|
|
1241
1366
|
if (!isRecord(parsed)) {
|
|
1242
|
-
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '
|
|
1367
|
+
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
1243
1368
|
}
|
|
1244
|
-
|
|
1369
|
+
checkSchema(parsed, absolutePath);
|
|
1245
1370
|
const targets = extractTargetsArray(parsed, absolutePath);
|
|
1246
1371
|
const definitions = targets.map((entry, index) => assertTargetDefinition(entry, index, absolutePath));
|
|
1247
1372
|
return definitions;
|
|
@@ -1463,7 +1588,7 @@ var import_node_crypto = require("crypto");
|
|
|
1463
1588
|
var HeuristicGrader = class {
|
|
1464
1589
|
kind = "heuristic";
|
|
1465
1590
|
grade(context) {
|
|
1466
|
-
const expectedAspects = extractAspects(context.
|
|
1591
|
+
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1467
1592
|
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1468
1593
|
const misses = [...result.misses];
|
|
1469
1594
|
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
@@ -1496,14 +1621,14 @@ var QualityGrader = class {
|
|
|
1496
1621
|
if (!judgeProvider) {
|
|
1497
1622
|
throw new Error("No judge provider available for LLM grading");
|
|
1498
1623
|
}
|
|
1499
|
-
const prompt = buildQualityPrompt(context.
|
|
1624
|
+
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
1500
1625
|
const metadata = {
|
|
1501
1626
|
systemPrompt: QUALITY_SYSTEM_PROMPT
|
|
1502
1627
|
};
|
|
1503
1628
|
const response = await judgeProvider.invoke({
|
|
1504
1629
|
prompt,
|
|
1505
1630
|
metadata,
|
|
1506
|
-
|
|
1631
|
+
evalCaseId: context.evalCase.id,
|
|
1507
1632
|
attempt: context.attempt,
|
|
1508
1633
|
maxOutputTokens: this.maxOutputTokens,
|
|
1509
1634
|
temperature: this.temperature
|
|
@@ -1549,16 +1674,16 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
1549
1674
|
function buildQualityPrompt(testCase, candidate) {
|
|
1550
1675
|
const parts = [
|
|
1551
1676
|
"[[ ## expected_outcome ## ]]",
|
|
1552
|
-
testCase.outcome,
|
|
1677
|
+
testCase.outcome.trim(),
|
|
1553
1678
|
"",
|
|
1554
1679
|
"[[ ## request ## ]]",
|
|
1555
|
-
testCase.task,
|
|
1680
|
+
testCase.task.trim(),
|
|
1556
1681
|
"",
|
|
1557
1682
|
"[[ ## reference_answer ## ]]",
|
|
1558
|
-
testCase.expected_assistant_raw,
|
|
1683
|
+
testCase.expected_assistant_raw.trim(),
|
|
1559
1684
|
"",
|
|
1560
1685
|
"[[ ## generated_answer ## ]]",
|
|
1561
|
-
candidate,
|
|
1686
|
+
candidate.trim(),
|
|
1562
1687
|
"",
|
|
1563
1688
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
1564
1689
|
];
|
|
@@ -1802,17 +1927,17 @@ async function runEvaluation(options) {
|
|
|
1802
1927
|
cache,
|
|
1803
1928
|
useCache,
|
|
1804
1929
|
now,
|
|
1805
|
-
|
|
1930
|
+
evalId,
|
|
1806
1931
|
verbose,
|
|
1807
1932
|
onResult,
|
|
1808
1933
|
onProgress
|
|
1809
1934
|
} = options;
|
|
1810
|
-
const load =
|
|
1811
|
-
const
|
|
1812
|
-
const
|
|
1813
|
-
if (
|
|
1814
|
-
if (
|
|
1815
|
-
throw new Error(`Test case with id '${
|
|
1935
|
+
const load = loadEvalCases;
|
|
1936
|
+
const evalCases = await load(testFilePath, repoRoot, { verbose });
|
|
1937
|
+
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
1938
|
+
if (filteredEvalCases.length === 0) {
|
|
1939
|
+
if (evalId) {
|
|
1940
|
+
throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
|
|
1816
1941
|
}
|
|
1817
1942
|
return [];
|
|
1818
1943
|
}
|
|
@@ -1856,11 +1981,11 @@ async function runEvaluation(options) {
|
|
|
1856
1981
|
};
|
|
1857
1982
|
const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
|
|
1858
1983
|
const primaryProvider = getOrCreateProvider(target);
|
|
1859
|
-
if (onProgress &&
|
|
1860
|
-
for (let i = 0; i <
|
|
1984
|
+
if (onProgress && filteredEvalCases.length > 0) {
|
|
1985
|
+
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1861
1986
|
await onProgress({
|
|
1862
1987
|
workerId: i + 1,
|
|
1863
|
-
|
|
1988
|
+
evalId: filteredEvalCases[i].id,
|
|
1864
1989
|
status: "pending"
|
|
1865
1990
|
});
|
|
1866
1991
|
}
|
|
@@ -1868,23 +1993,23 @@ async function runEvaluation(options) {
|
|
|
1868
1993
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1869
1994
|
const limit = pLimit(workers);
|
|
1870
1995
|
let nextWorkerId = 1;
|
|
1871
|
-
const
|
|
1872
|
-
const promises =
|
|
1873
|
-
(
|
|
1996
|
+
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
1997
|
+
const promises = filteredEvalCases.map(
|
|
1998
|
+
(evalCase) => limit(async () => {
|
|
1874
1999
|
const workerId = nextWorkerId++;
|
|
1875
|
-
|
|
2000
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
1876
2001
|
if (onProgress) {
|
|
1877
2002
|
await onProgress({
|
|
1878
2003
|
workerId,
|
|
1879
|
-
|
|
2004
|
+
evalId: evalCase.id,
|
|
1880
2005
|
status: "running",
|
|
1881
2006
|
startedAt: Date.now()
|
|
1882
2007
|
});
|
|
1883
2008
|
}
|
|
1884
2009
|
try {
|
|
1885
2010
|
const judgeProvider = await resolveJudgeProvider(target);
|
|
1886
|
-
const result = await
|
|
1887
|
-
|
|
2011
|
+
const result = await runEvalCase({
|
|
2012
|
+
evalCase,
|
|
1888
2013
|
provider: primaryProvider,
|
|
1889
2014
|
target,
|
|
1890
2015
|
graders: graderRegistry,
|
|
@@ -1899,7 +2024,7 @@ async function runEvaluation(options) {
|
|
|
1899
2024
|
if (onProgress) {
|
|
1900
2025
|
await onProgress({
|
|
1901
2026
|
workerId,
|
|
1902
|
-
|
|
2027
|
+
evalId: evalCase.id,
|
|
1903
2028
|
status: "completed",
|
|
1904
2029
|
startedAt: 0,
|
|
1905
2030
|
// Not used for completed status
|
|
@@ -1914,7 +2039,7 @@ async function runEvaluation(options) {
|
|
|
1914
2039
|
if (onProgress) {
|
|
1915
2040
|
await onProgress({
|
|
1916
2041
|
workerId,
|
|
1917
|
-
|
|
2042
|
+
evalId: evalCase.id,
|
|
1918
2043
|
status: "failed",
|
|
1919
2044
|
completedAt: Date.now(),
|
|
1920
2045
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -1931,10 +2056,10 @@ async function runEvaluation(options) {
|
|
|
1931
2056
|
if (outcome.status === "fulfilled") {
|
|
1932
2057
|
results.push(outcome.value);
|
|
1933
2058
|
} else {
|
|
1934
|
-
const
|
|
1935
|
-
const promptInputs = await buildPromptInputs(
|
|
2059
|
+
const evalCase = filteredEvalCases[i];
|
|
2060
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1936
2061
|
const errorResult = buildErrorResult(
|
|
1937
|
-
|
|
2062
|
+
evalCase,
|
|
1938
2063
|
target.name,
|
|
1939
2064
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
1940
2065
|
outcome.reason,
|
|
@@ -1948,9 +2073,9 @@ async function runEvaluation(options) {
|
|
|
1948
2073
|
}
|
|
1949
2074
|
return results;
|
|
1950
2075
|
}
|
|
1951
|
-
async function
|
|
2076
|
+
async function runEvalCase(options) {
|
|
1952
2077
|
const {
|
|
1953
|
-
|
|
2078
|
+
evalCase,
|
|
1954
2079
|
provider,
|
|
1955
2080
|
target,
|
|
1956
2081
|
graders,
|
|
@@ -1963,11 +2088,11 @@ async function runTestCase(options) {
|
|
|
1963
2088
|
signal,
|
|
1964
2089
|
judgeProvider
|
|
1965
2090
|
} = options;
|
|
1966
|
-
const promptInputs = await buildPromptInputs(
|
|
2091
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1967
2092
|
if (promptDumpDir) {
|
|
1968
|
-
await dumpPrompt(promptDumpDir,
|
|
2093
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
1969
2094
|
}
|
|
1970
|
-
const cacheKey = useCache ? createCacheKey(provider, target,
|
|
2095
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
1971
2096
|
let cachedResponse;
|
|
1972
2097
|
if (cacheKey && cache) {
|
|
1973
2098
|
cachedResponse = await cache.get(cacheKey);
|
|
@@ -1980,7 +2105,7 @@ async function runTestCase(options) {
|
|
|
1980
2105
|
while (!providerResponse && attempt < attemptBudget) {
|
|
1981
2106
|
try {
|
|
1982
2107
|
providerResponse = await invokeProvider(provider, {
|
|
1983
|
-
|
|
2108
|
+
evalCase,
|
|
1984
2109
|
target,
|
|
1985
2110
|
promptInputs,
|
|
1986
2111
|
attempt,
|
|
@@ -1993,12 +2118,12 @@ async function runTestCase(options) {
|
|
|
1993
2118
|
attempt += 1;
|
|
1994
2119
|
continue;
|
|
1995
2120
|
}
|
|
1996
|
-
return buildErrorResult(
|
|
2121
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
1997
2122
|
}
|
|
1998
2123
|
}
|
|
1999
2124
|
if (!providerResponse) {
|
|
2000
2125
|
return buildErrorResult(
|
|
2001
|
-
|
|
2126
|
+
evalCase,
|
|
2002
2127
|
target.name,
|
|
2003
2128
|
nowFn(),
|
|
2004
2129
|
lastError ?? new Error("Provider did not return a response"),
|
|
@@ -2008,7 +2133,7 @@ async function runTestCase(options) {
|
|
|
2008
2133
|
if (cacheKey && cache && !cachedResponse) {
|
|
2009
2134
|
await cache.set(cacheKey, providerResponse);
|
|
2010
2135
|
}
|
|
2011
|
-
const graderKind =
|
|
2136
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
2012
2137
|
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
2013
2138
|
if (!activeGrader) {
|
|
2014
2139
|
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
@@ -2017,7 +2142,7 @@ async function runTestCase(options) {
|
|
|
2017
2142
|
try {
|
|
2018
2143
|
const gradeTimestamp = nowFn();
|
|
2019
2144
|
grade = await activeGrader.grade({
|
|
2020
|
-
|
|
2145
|
+
evalCase,
|
|
2021
2146
|
candidate: providerResponse.text ?? "",
|
|
2022
2147
|
target,
|
|
2023
2148
|
provider,
|
|
@@ -2027,17 +2152,18 @@ async function runTestCase(options) {
|
|
|
2027
2152
|
judgeProvider
|
|
2028
2153
|
});
|
|
2029
2154
|
} catch (error) {
|
|
2030
|
-
return buildErrorResult(
|
|
2155
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2031
2156
|
}
|
|
2032
2157
|
const completedAt = nowFn();
|
|
2033
2158
|
const rawRequest = {
|
|
2034
2159
|
request: promptInputs.request,
|
|
2035
2160
|
guidelines: promptInputs.guidelines,
|
|
2036
|
-
guideline_paths:
|
|
2161
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2162
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
2037
2163
|
};
|
|
2038
2164
|
return {
|
|
2039
|
-
|
|
2040
|
-
conversation_id:
|
|
2165
|
+
eval_id: evalCase.id,
|
|
2166
|
+
conversation_id: evalCase.conversation_id,
|
|
2041
2167
|
score: grade.score,
|
|
2042
2168
|
hits: grade.hits,
|
|
2043
2169
|
misses: grade.misses,
|
|
@@ -2051,11 +2177,11 @@ async function runTestCase(options) {
|
|
|
2051
2177
|
grader_raw_request: grade.graderRawRequest
|
|
2052
2178
|
};
|
|
2053
2179
|
}
|
|
2054
|
-
function
|
|
2055
|
-
if (!
|
|
2056
|
-
return
|
|
2180
|
+
function filterEvalCases(evalCases, evalId) {
|
|
2181
|
+
if (!evalId) {
|
|
2182
|
+
return evalCases;
|
|
2057
2183
|
}
|
|
2058
|
-
return
|
|
2184
|
+
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
2059
2185
|
}
|
|
2060
2186
|
function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
2061
2187
|
const heuristic = overrides?.heuristic ?? new HeuristicGrader();
|
|
@@ -2073,16 +2199,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2073
2199
|
llm_judge: llmJudge
|
|
2074
2200
|
};
|
|
2075
2201
|
}
|
|
2076
|
-
async function dumpPrompt(directory,
|
|
2202
|
+
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
2077
2203
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2078
|
-
const filename = `${timestamp}_${sanitizeFilename(
|
|
2204
|
+
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
2079
2205
|
const filePath = import_node_path5.default.resolve(directory, filename);
|
|
2080
2206
|
await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
|
|
2081
2207
|
const payload = {
|
|
2082
|
-
|
|
2208
|
+
eval_id: evalCase.id,
|
|
2083
2209
|
request: promptInputs.request,
|
|
2084
2210
|
guidelines: promptInputs.guidelines,
|
|
2085
|
-
guideline_paths:
|
|
2211
|
+
guideline_paths: evalCase.guideline_paths
|
|
2086
2212
|
};
|
|
2087
2213
|
await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
2088
2214
|
}
|
|
@@ -2094,7 +2220,7 @@ function sanitizeFilename(value) {
|
|
|
2094
2220
|
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
2095
2221
|
}
|
|
2096
2222
|
async function invokeProvider(provider, options) {
|
|
2097
|
-
const {
|
|
2223
|
+
const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2098
2224
|
const controller = new AbortController();
|
|
2099
2225
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2100
2226
|
if (signal) {
|
|
@@ -2104,12 +2230,12 @@ async function invokeProvider(provider, options) {
|
|
|
2104
2230
|
return await provider.invoke({
|
|
2105
2231
|
prompt: promptInputs.request,
|
|
2106
2232
|
guidelines: promptInputs.guidelines,
|
|
2107
|
-
|
|
2108
|
-
|
|
2233
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2234
|
+
attachments: evalCase.file_paths,
|
|
2235
|
+
evalCaseId: evalCase.id,
|
|
2109
2236
|
attempt,
|
|
2110
2237
|
metadata: {
|
|
2111
|
-
|
|
2112
|
-
grader: testCase.grader
|
|
2238
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2113
2239
|
},
|
|
2114
2240
|
signal: controller.signal
|
|
2115
2241
|
});
|
|
@@ -2119,17 +2245,18 @@ async function invokeProvider(provider, options) {
|
|
|
2119
2245
|
}
|
|
2120
2246
|
}
|
|
2121
2247
|
}
|
|
2122
|
-
function buildErrorResult(
|
|
2248
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
2123
2249
|
const message = error instanceof Error ? error.message : String(error);
|
|
2124
2250
|
const rawRequest = {
|
|
2125
2251
|
request: promptInputs.request,
|
|
2126
2252
|
guidelines: promptInputs.guidelines,
|
|
2127
|
-
guideline_paths:
|
|
2253
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2254
|
+
system_message: promptInputs.systemMessage ?? "",
|
|
2128
2255
|
error: message
|
|
2129
2256
|
};
|
|
2130
2257
|
return {
|
|
2131
|
-
|
|
2132
|
-
conversation_id:
|
|
2258
|
+
eval_id: evalCase.id,
|
|
2259
|
+
conversation_id: evalCase.conversation_id,
|
|
2133
2260
|
score: 0,
|
|
2134
2261
|
hits: [],
|
|
2135
2262
|
misses: [`Error: ${message}`],
|
|
@@ -2141,13 +2268,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
|
|
|
2141
2268
|
raw_request: rawRequest
|
|
2142
2269
|
};
|
|
2143
2270
|
}
|
|
2144
|
-
function createCacheKey(provider, target,
|
|
2271
|
+
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
2145
2272
|
const hash = (0, import_node_crypto2.createHash)("sha256");
|
|
2146
2273
|
hash.update(provider.id);
|
|
2147
2274
|
hash.update(target.name);
|
|
2148
|
-
hash.update(
|
|
2275
|
+
hash.update(evalCase.id);
|
|
2149
2276
|
hash.update(promptInputs.request);
|
|
2150
2277
|
hash.update(promptInputs.guidelines);
|
|
2278
|
+
hash.update(promptInputs.systemMessage ?? "");
|
|
2151
2279
|
return hash.digest("hex");
|
|
2152
2280
|
}
|
|
2153
2281
|
function isTimeoutLike(error) {
|
|
@@ -2176,7 +2304,9 @@ function createAgentKernel() {
|
|
|
2176
2304
|
HeuristicGrader,
|
|
2177
2305
|
QualityGrader,
|
|
2178
2306
|
TEST_MESSAGE_ROLES,
|
|
2307
|
+
buildDirectoryChain,
|
|
2179
2308
|
buildPromptInputs,
|
|
2309
|
+
buildSearchRoots,
|
|
2180
2310
|
calculateHits,
|
|
2181
2311
|
calculateMisses,
|
|
2182
2312
|
createAgentKernel,
|
|
@@ -2184,6 +2314,8 @@ function createAgentKernel() {
|
|
|
2184
2314
|
ensureVSCodeSubagents,
|
|
2185
2315
|
extractAspects,
|
|
2186
2316
|
extractCodeBlocks,
|
|
2317
|
+
fileExists,
|
|
2318
|
+
findGitRoot,
|
|
2187
2319
|
getHitCount,
|
|
2188
2320
|
isErrorLike,
|
|
2189
2321
|
isGraderKind,
|
|
@@ -2193,12 +2325,13 @@ function createAgentKernel() {
|
|
|
2193
2325
|
isTestMessage,
|
|
2194
2326
|
isTestMessageRole,
|
|
2195
2327
|
listTargetNames,
|
|
2196
|
-
|
|
2328
|
+
loadEvalCases,
|
|
2197
2329
|
readTargetDefinitions,
|
|
2198
2330
|
resolveAndCreateProvider,
|
|
2331
|
+
resolveFileReference,
|
|
2199
2332
|
resolveTargetDefinition,
|
|
2333
|
+
runEvalCase,
|
|
2200
2334
|
runEvaluation,
|
|
2201
|
-
runTestCase,
|
|
2202
2335
|
scoreCandidateResponse
|
|
2203
2336
|
});
|
|
2204
2337
|
//# sourceMappingURL=index.cjs.map
|