@agentv/core 0.2.6 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-QVS4OL44.js → chunk-P4GOYWYH.js} +27 -1
- package/dist/chunk-P4GOYWYH.js.map +1 -0
- package/dist/chunk-XXNQA4EW.js +140 -0
- package/dist/chunk-XXNQA4EW.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +93 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +7 -2
- package/dist/evaluation/validation/index.d.ts +7 -2
- package/dist/evaluation/validation/index.js +91 -7
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +533 -187
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +53 -10
- package/dist/index.d.ts +53 -10
- package/dist/index.js +502 -193
- package/dist/index.js.map +1 -1
- package/package.json +6 -2
- package/dist/chunk-QVS4OL44.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -34,7 +34,9 @@ __export(index_exports, {
|
|
|
34
34
|
HeuristicGrader: () => HeuristicGrader,
|
|
35
35
|
QualityGrader: () => QualityGrader,
|
|
36
36
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
|
+
buildDirectoryChain: () => buildDirectoryChain,
|
|
37
38
|
buildPromptInputs: () => buildPromptInputs,
|
|
39
|
+
buildSearchRoots: () => buildSearchRoots,
|
|
38
40
|
calculateHits: () => calculateHits,
|
|
39
41
|
calculateMisses: () => calculateMisses,
|
|
40
42
|
createAgentKernel: () => createAgentKernel,
|
|
@@ -42,6 +44,8 @@ __export(index_exports, {
|
|
|
42
44
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
43
45
|
extractAspects: () => extractAspects,
|
|
44
46
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
47
|
+
fileExists: () => fileExists,
|
|
48
|
+
findGitRoot: () => findGitRoot,
|
|
45
49
|
getHitCount: () => getHitCount,
|
|
46
50
|
isErrorLike: () => isErrorLike,
|
|
47
51
|
isGraderKind: () => isGraderKind,
|
|
@@ -51,12 +55,13 @@ __export(index_exports, {
|
|
|
51
55
|
isTestMessage: () => isTestMessage,
|
|
52
56
|
isTestMessageRole: () => isTestMessageRole,
|
|
53
57
|
listTargetNames: () => listTargetNames,
|
|
54
|
-
|
|
58
|
+
loadEvalCases: () => loadEvalCases,
|
|
55
59
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
56
60
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
61
|
+
resolveFileReference: () => resolveFileReference,
|
|
57
62
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
63
|
+
runEvalCase: () => runEvalCase,
|
|
58
64
|
runEvaluation: () => runEvaluation,
|
|
59
|
-
runTestCase: () => runTestCase,
|
|
60
65
|
scoreCandidateResponse: () => scoreCandidateResponse
|
|
61
66
|
});
|
|
62
67
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -113,6 +118,7 @@ function getHitCount(result) {
|
|
|
113
118
|
}
|
|
114
119
|
|
|
115
120
|
// src/evaluation/yaml-parser.ts
|
|
121
|
+
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
116
122
|
var import_node_fs2 = require("fs");
|
|
117
123
|
var import_promises2 = require("fs/promises");
|
|
118
124
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
@@ -131,6 +137,46 @@ async function fileExists(filePath) {
|
|
|
131
137
|
return false;
|
|
132
138
|
}
|
|
133
139
|
}
|
|
140
|
+
async function findGitRoot(startPath) {
|
|
141
|
+
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
142
|
+
const root = import_node_path.default.parse(currentDir).root;
|
|
143
|
+
while (currentDir !== root) {
|
|
144
|
+
const gitPath = import_node_path.default.join(currentDir, ".git");
|
|
145
|
+
if (await fileExists(gitPath)) {
|
|
146
|
+
return currentDir;
|
|
147
|
+
}
|
|
148
|
+
const parentDir = import_node_path.default.dirname(currentDir);
|
|
149
|
+
if (parentDir === currentDir) {
|
|
150
|
+
break;
|
|
151
|
+
}
|
|
152
|
+
currentDir = parentDir;
|
|
153
|
+
}
|
|
154
|
+
return null;
|
|
155
|
+
}
|
|
156
|
+
function buildDirectoryChain(filePath, repoRoot) {
|
|
157
|
+
const directories = [];
|
|
158
|
+
const seen = /* @__PURE__ */ new Set();
|
|
159
|
+
const boundary = import_node_path.default.resolve(repoRoot);
|
|
160
|
+
let current = import_node_path.default.resolve(import_node_path.default.dirname(filePath));
|
|
161
|
+
while (current !== void 0) {
|
|
162
|
+
if (!seen.has(current)) {
|
|
163
|
+
directories.push(current);
|
|
164
|
+
seen.add(current);
|
|
165
|
+
}
|
|
166
|
+
if (current === boundary) {
|
|
167
|
+
break;
|
|
168
|
+
}
|
|
169
|
+
const parent = import_node_path.default.dirname(current);
|
|
170
|
+
if (parent === current) {
|
|
171
|
+
break;
|
|
172
|
+
}
|
|
173
|
+
current = parent;
|
|
174
|
+
}
|
|
175
|
+
if (!seen.has(boundary)) {
|
|
176
|
+
directories.push(boundary);
|
|
177
|
+
}
|
|
178
|
+
return directories;
|
|
179
|
+
}
|
|
134
180
|
function buildSearchRoots(evalPath, repoRoot) {
|
|
135
181
|
const uniqueRoots = [];
|
|
136
182
|
const addRoot = (root) => {
|
|
@@ -188,9 +234,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
|
188
234
|
var ANSI_YELLOW = "\x1B[33m";
|
|
189
235
|
var ANSI_RESET = "\x1B[0m";
|
|
190
236
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
191
|
-
|
|
237
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
238
|
+
async function loadConfig(evalFilePath, repoRoot) {
|
|
239
|
+
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
240
|
+
for (const directory of directories) {
|
|
241
|
+
const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
|
|
242
|
+
if (!await fileExists2(configPath)) {
|
|
243
|
+
continue;
|
|
244
|
+
}
|
|
245
|
+
try {
|
|
246
|
+
const rawConfig = await (0, import_promises2.readFile)(configPath, "utf8");
|
|
247
|
+
const parsed = (0, import_yaml.parse)(rawConfig);
|
|
248
|
+
if (!isJsonObject(parsed)) {
|
|
249
|
+
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
250
|
+
continue;
|
|
251
|
+
}
|
|
252
|
+
const config = parsed;
|
|
253
|
+
const schema = config.$schema;
|
|
254
|
+
if (schema !== SCHEMA_CONFIG_V2) {
|
|
255
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
256
|
+
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
257
|
+
logWarning(message);
|
|
258
|
+
continue;
|
|
259
|
+
}
|
|
260
|
+
const guidelinePatterns = config.guideline_patterns;
|
|
261
|
+
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
262
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
265
|
+
if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
|
|
266
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
|
|
267
|
+
continue;
|
|
268
|
+
}
|
|
269
|
+
return {
|
|
270
|
+
guideline_patterns: guidelinePatterns
|
|
271
|
+
};
|
|
272
|
+
} catch (error) {
|
|
273
|
+
logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
|
|
274
|
+
continue;
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
return null;
|
|
278
|
+
}
|
|
279
|
+
function isGuidelineFile(filePath, patterns) {
|
|
192
280
|
const normalized = filePath.split("\\").join("/");
|
|
193
|
-
|
|
281
|
+
const patternsToUse = patterns ?? [];
|
|
282
|
+
return import_micromatch.default.isMatch(normalized, patternsToUse);
|
|
194
283
|
}
|
|
195
284
|
function extractCodeBlocks(segments) {
|
|
196
285
|
const codeBlocks = [];
|
|
@@ -210,43 +299,45 @@ function extractCodeBlocks(segments) {
|
|
|
210
299
|
}
|
|
211
300
|
return codeBlocks;
|
|
212
301
|
}
|
|
213
|
-
async function
|
|
302
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
214
303
|
const verbose = options?.verbose ?? false;
|
|
215
|
-
const absoluteTestPath = import_node_path2.default.resolve(
|
|
304
|
+
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
216
305
|
if (!await fileExists2(absoluteTestPath)) {
|
|
217
|
-
throw new Error(`Test file not found: ${
|
|
306
|
+
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
218
307
|
}
|
|
219
308
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
220
309
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
310
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
311
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
221
312
|
const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
|
|
222
313
|
const parsed = (0, import_yaml.parse)(rawFile);
|
|
223
314
|
if (!isJsonObject(parsed)) {
|
|
224
|
-
throw new Error(`Invalid test file format: ${
|
|
315
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
225
316
|
}
|
|
226
317
|
const suite = parsed;
|
|
227
318
|
const schema = suite.$schema;
|
|
228
319
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
229
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${
|
|
320
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
230
321
|
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
231
322
|
throw new Error(message);
|
|
232
323
|
}
|
|
233
324
|
const rawTestcases = suite.evalcases;
|
|
234
325
|
if (!Array.isArray(rawTestcases)) {
|
|
235
|
-
throw new Error(`Invalid test file format: ${
|
|
326
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
236
327
|
}
|
|
237
328
|
const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
|
|
238
329
|
const results = [];
|
|
239
|
-
for (const
|
|
240
|
-
if (!isJsonObject(
|
|
330
|
+
for (const rawEvalcase of rawTestcases) {
|
|
331
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
241
332
|
logWarning("Skipping invalid test case entry (expected object)");
|
|
242
333
|
continue;
|
|
243
334
|
}
|
|
244
|
-
const
|
|
245
|
-
const id = asString(
|
|
246
|
-
const conversationId = asString(
|
|
247
|
-
const outcome = asString(
|
|
248
|
-
const inputMessagesValue =
|
|
249
|
-
const expectedMessagesValue =
|
|
335
|
+
const evalcase = rawEvalcase;
|
|
336
|
+
const id = asString(evalcase.id);
|
|
337
|
+
const conversationId = asString(evalcase.conversation_id);
|
|
338
|
+
const outcome = asString(evalcase.outcome);
|
|
339
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
340
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
250
341
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
251
342
|
logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
|
|
252
343
|
continue;
|
|
@@ -259,6 +350,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
259
350
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
260
351
|
const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
|
|
261
352
|
const userMessages = inputMessages.filter((message) => message.role === "user");
|
|
353
|
+
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
262
354
|
if (assistantMessages.length === 0) {
|
|
263
355
|
logWarning(`No assistant message found for test case: ${id}`);
|
|
264
356
|
continue;
|
|
@@ -266,6 +358,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
266
358
|
if (assistantMessages.length > 1) {
|
|
267
359
|
logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
|
|
268
360
|
}
|
|
361
|
+
if (systemMessages.length > 1) {
|
|
362
|
+
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
363
|
+
}
|
|
364
|
+
let systemMessageContent;
|
|
365
|
+
if (systemMessages.length > 0) {
|
|
366
|
+
const content = systemMessages[0]?.content;
|
|
367
|
+
if (typeof content === "string") {
|
|
368
|
+
systemMessageContent = content;
|
|
369
|
+
} else if (Array.isArray(content)) {
|
|
370
|
+
const textParts = [];
|
|
371
|
+
for (const segment of content) {
|
|
372
|
+
if (isJsonObject(segment)) {
|
|
373
|
+
const value = segment.value;
|
|
374
|
+
if (typeof value === "string") {
|
|
375
|
+
textParts.push(value);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
if (textParts.length > 0) {
|
|
380
|
+
systemMessageContent = textParts.join("\n\n");
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
}
|
|
269
384
|
const userSegments = [];
|
|
270
385
|
const guidelinePaths = [];
|
|
271
386
|
const userTextParts = [];
|
|
@@ -297,7 +412,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
297
412
|
}
|
|
298
413
|
try {
|
|
299
414
|
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
300
|
-
|
|
415
|
+
const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
|
|
416
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
301
417
|
guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
|
|
302
418
|
if (verbose) {
|
|
303
419
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
@@ -307,7 +423,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
307
423
|
userSegments.push({
|
|
308
424
|
type: "file",
|
|
309
425
|
path: displayPath,
|
|
310
|
-
text: fileContent
|
|
426
|
+
text: fileContent,
|
|
427
|
+
resolvedPath: import_node_path2.default.resolve(resolvedPath)
|
|
311
428
|
});
|
|
312
429
|
if (verbose) {
|
|
313
430
|
console.log(` [File] Found: ${displayPath}`);
|
|
@@ -331,14 +448,27 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
331
448
|
const assistantContent = assistantMessages[0]?.content;
|
|
332
449
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
333
450
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
334
|
-
const testCaseGrader = coerceGrader(
|
|
451
|
+
const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
|
|
452
|
+
const userFilePaths = [];
|
|
453
|
+
for (const segment of userSegments) {
|
|
454
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
455
|
+
userFilePaths.push(segment.resolvedPath);
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
const allFilePaths = [
|
|
459
|
+
...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
460
|
+
...userFilePaths
|
|
461
|
+
];
|
|
335
462
|
const testCase = {
|
|
336
463
|
id,
|
|
337
464
|
conversation_id: conversationId,
|
|
338
465
|
task: userTextPrompt,
|
|
339
466
|
user_segments: userSegments,
|
|
467
|
+
system_message: systemMessageContent,
|
|
340
468
|
expected_assistant_raw: expectedAssistantRaw,
|
|
341
469
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
470
|
+
guideline_patterns: guidelinePatterns,
|
|
471
|
+
file_paths: allFilePaths,
|
|
342
472
|
code_snippets: codeSnippets,
|
|
343
473
|
outcome,
|
|
344
474
|
grader: testCaseGrader
|
|
@@ -404,7 +534,7 @@ ${body}`);
|
|
|
404
534
|
}
|
|
405
535
|
const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
406
536
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
407
|
-
return { request, guidelines };
|
|
537
|
+
return { request, guidelines, systemMessage: testCase.system_message };
|
|
408
538
|
}
|
|
409
539
|
async function fileExists2(absolutePath) {
|
|
410
540
|
try {
|
|
@@ -530,15 +660,18 @@ function buildChatPrompt(request) {
|
|
|
530
660
|
return request.chatPrompt;
|
|
531
661
|
}
|
|
532
662
|
const systemSegments = [];
|
|
533
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
534
|
-
systemSegments.push(`Guidelines:
|
|
535
|
-
${request.guidelines.trim()}`);
|
|
536
|
-
}
|
|
537
663
|
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
538
664
|
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
539
665
|
systemSegments.push(metadataSystemPrompt.trim());
|
|
666
|
+
} else {
|
|
667
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
540
668
|
}
|
|
541
|
-
|
|
669
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
670
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
671
|
+
|
|
672
|
+
${request.guidelines.trim()}`);
|
|
673
|
+
}
|
|
674
|
+
const systemContent = systemSegments.join("\n\n");
|
|
542
675
|
const userContent = request.prompt.trim();
|
|
543
676
|
const prompt = [
|
|
544
677
|
{
|
|
@@ -767,6 +900,9 @@ function normalizeAzureApiVersion(value) {
|
|
|
767
900
|
function resolveTargetDefinition(definition, env = process.env) {
|
|
768
901
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
769
902
|
const provider = parsed.provider.toLowerCase();
|
|
903
|
+
const providerBatching = resolveOptionalBoolean(
|
|
904
|
+
parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
|
|
905
|
+
);
|
|
770
906
|
switch (provider) {
|
|
771
907
|
case "azure":
|
|
772
908
|
case "azure-openai":
|
|
@@ -775,6 +911,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
775
911
|
name: parsed.name,
|
|
776
912
|
judgeTarget: parsed.judge_target,
|
|
777
913
|
workers: parsed.workers,
|
|
914
|
+
providerBatching,
|
|
778
915
|
config: resolveAzureConfig(parsed, env)
|
|
779
916
|
};
|
|
780
917
|
case "anthropic":
|
|
@@ -783,6 +920,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
783
920
|
name: parsed.name,
|
|
784
921
|
judgeTarget: parsed.judge_target,
|
|
785
922
|
workers: parsed.workers,
|
|
923
|
+
providerBatching,
|
|
786
924
|
config: resolveAnthropicConfig(parsed, env)
|
|
787
925
|
};
|
|
788
926
|
case "gemini":
|
|
@@ -793,6 +931,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
793
931
|
name: parsed.name,
|
|
794
932
|
judgeTarget: parsed.judge_target,
|
|
795
933
|
workers: parsed.workers,
|
|
934
|
+
providerBatching,
|
|
796
935
|
config: resolveGeminiConfig(parsed, env)
|
|
797
936
|
};
|
|
798
937
|
case "mock":
|
|
@@ -801,6 +940,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
801
940
|
name: parsed.name,
|
|
802
941
|
judgeTarget: parsed.judge_target,
|
|
803
942
|
workers: parsed.workers,
|
|
943
|
+
providerBatching,
|
|
804
944
|
config: resolveMockConfig(parsed)
|
|
805
945
|
};
|
|
806
946
|
case "vscode":
|
|
@@ -810,6 +950,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
810
950
|
name: parsed.name,
|
|
811
951
|
judgeTarget: parsed.judge_target,
|
|
812
952
|
workers: parsed.workers,
|
|
953
|
+
providerBatching,
|
|
813
954
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
814
955
|
};
|
|
815
956
|
default:
|
|
@@ -995,14 +1136,13 @@ function isLikelyEnvReference(value) {
|
|
|
995
1136
|
|
|
996
1137
|
// src/evaluation/providers/vscode.ts
|
|
997
1138
|
var import_promises3 = require("fs/promises");
|
|
998
|
-
var import_node_os = require("os");
|
|
999
1139
|
var import_node_path3 = __toESM(require("path"), 1);
|
|
1000
1140
|
var import_subagent = require("subagent");
|
|
1001
|
-
var PROMPT_FILE_PREFIX = "agentv-vscode-";
|
|
1002
1141
|
var VSCodeProvider = class {
|
|
1003
1142
|
id;
|
|
1004
1143
|
kind;
|
|
1005
1144
|
targetName;
|
|
1145
|
+
supportsBatch = true;
|
|
1006
1146
|
config;
|
|
1007
1147
|
constructor(targetName, config, kind) {
|
|
1008
1148
|
this.id = `${kind}:${targetName}`;
|
|
@@ -1015,117 +1155,159 @@ var VSCodeProvider = class {
|
|
|
1015
1155
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
1016
1156
|
}
|
|
1017
1157
|
const attachments = normalizeAttachments(request.attachments);
|
|
1018
|
-
const promptContent = buildPromptDocument(request, attachments);
|
|
1019
|
-
const
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
1036
|
-
throw new Error(failure);
|
|
1037
|
-
}
|
|
1038
|
-
if (this.config.dryRun) {
|
|
1039
|
-
return {
|
|
1040
|
-
text: "",
|
|
1041
|
-
raw: {
|
|
1042
|
-
session,
|
|
1043
|
-
promptFile: promptPath,
|
|
1044
|
-
attachments
|
|
1045
|
-
}
|
|
1046
|
-
};
|
|
1047
|
-
}
|
|
1048
|
-
const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
|
|
1158
|
+
const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
|
|
1159
|
+
const session = await (0, import_subagent.dispatchAgentSession)({
|
|
1160
|
+
userQuery: promptContent,
|
|
1161
|
+
// Use full prompt content instead of just request.prompt
|
|
1162
|
+
extraAttachments: attachments,
|
|
1163
|
+
wait: this.config.waitForResponse,
|
|
1164
|
+
dryRun: this.config.dryRun,
|
|
1165
|
+
vscodeCmd: this.config.command,
|
|
1166
|
+
subagentRoot: this.config.subagentRoot,
|
|
1167
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1168
|
+
silent: true
|
|
1169
|
+
});
|
|
1170
|
+
if (session.exitCode !== 0 || !session.responseFile) {
|
|
1171
|
+
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
1172
|
+
throw new Error(failure);
|
|
1173
|
+
}
|
|
1174
|
+
if (this.config.dryRun) {
|
|
1049
1175
|
return {
|
|
1050
|
-
text:
|
|
1176
|
+
text: "",
|
|
1051
1177
|
raw: {
|
|
1052
1178
|
session,
|
|
1053
|
-
promptFile: promptPath,
|
|
1054
1179
|
attachments
|
|
1055
1180
|
}
|
|
1056
1181
|
};
|
|
1057
|
-
} finally {
|
|
1058
|
-
await (0, import_promises3.rm)(directory, { recursive: true, force: true });
|
|
1059
1182
|
}
|
|
1183
|
+
const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
|
|
1184
|
+
return {
|
|
1185
|
+
text: responseText,
|
|
1186
|
+
raw: {
|
|
1187
|
+
session,
|
|
1188
|
+
attachments
|
|
1189
|
+
}
|
|
1190
|
+
};
|
|
1191
|
+
}
|
|
1192
|
+
async invokeBatch(requests) {
|
|
1193
|
+
if (requests.length === 0) {
|
|
1194
|
+
return [];
|
|
1195
|
+
}
|
|
1196
|
+
const normalizedRequests = requests.map((req) => ({
|
|
1197
|
+
request: req,
|
|
1198
|
+
attachments: normalizeAttachments(req.attachments)
|
|
1199
|
+
}));
|
|
1200
|
+
const combinedAttachments = mergeAttachments(
|
|
1201
|
+
normalizedRequests.map(({ attachments }) => attachments)
|
|
1202
|
+
);
|
|
1203
|
+
const userQueries = normalizedRequests.map(
|
|
1204
|
+
({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
|
|
1205
|
+
);
|
|
1206
|
+
const session = await (0, import_subagent.dispatchBatchAgent)({
|
|
1207
|
+
userQueries,
|
|
1208
|
+
extraAttachments: combinedAttachments,
|
|
1209
|
+
wait: this.config.waitForResponse,
|
|
1210
|
+
dryRun: this.config.dryRun,
|
|
1211
|
+
vscodeCmd: this.config.command,
|
|
1212
|
+
subagentRoot: this.config.subagentRoot,
|
|
1213
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1214
|
+
silent: true
|
|
1215
|
+
});
|
|
1216
|
+
if (session.exitCode !== 0 || !session.responseFiles) {
|
|
1217
|
+
const failure = session.error ?? "VS Code subagent did not produce batch responses";
|
|
1218
|
+
throw new Error(failure);
|
|
1219
|
+
}
|
|
1220
|
+
if (this.config.dryRun) {
|
|
1221
|
+
return normalizedRequests.map(({ attachments }) => ({
|
|
1222
|
+
text: "",
|
|
1223
|
+
raw: {
|
|
1224
|
+
session,
|
|
1225
|
+
attachments,
|
|
1226
|
+
allAttachments: combinedAttachments
|
|
1227
|
+
}
|
|
1228
|
+
}));
|
|
1229
|
+
}
|
|
1230
|
+
if (session.responseFiles.length !== requests.length) {
|
|
1231
|
+
throw new Error(
|
|
1232
|
+
`VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
|
|
1233
|
+
);
|
|
1234
|
+
}
|
|
1235
|
+
const responses = [];
|
|
1236
|
+
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
1237
|
+
const responseText = await (0, import_promises3.readFile)(responseFile, "utf8");
|
|
1238
|
+
responses.push({
|
|
1239
|
+
text: responseText,
|
|
1240
|
+
raw: {
|
|
1241
|
+
session,
|
|
1242
|
+
attachments: normalizedRequests[index]?.attachments,
|
|
1243
|
+
allAttachments: combinedAttachments,
|
|
1244
|
+
responseFile
|
|
1245
|
+
}
|
|
1246
|
+
});
|
|
1247
|
+
}
|
|
1248
|
+
return responses;
|
|
1060
1249
|
}
|
|
1061
1250
|
};
|
|
1062
|
-
function buildPromptDocument(request, attachments) {
|
|
1251
|
+
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
1063
1252
|
const parts = [];
|
|
1064
|
-
const
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
if (request.metadata?.target) {
|
|
1073
|
-
parts.push(`- Target: ${String(request.metadata.target)}`);
|
|
1074
|
-
}
|
|
1075
|
-
parts.push("\n## Task\n", request.prompt.trim());
|
|
1076
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1077
|
-
parts.push("\n## Guidelines\n", request.guidelines.trim());
|
|
1078
|
-
}
|
|
1079
|
-
if (attachments && attachments.length > 0) {
|
|
1080
|
-
const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
|
|
1081
|
-
parts.push("\n## Attachments\n", attachmentList);
|
|
1253
|
+
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1254
|
+
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
1255
|
+
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
1256
|
+
(file) => !guidelineFiles.includes(file)
|
|
1257
|
+
);
|
|
1258
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
|
|
1259
|
+
if (prereadBlock.length > 0) {
|
|
1260
|
+
parts.push("\n", prereadBlock);
|
|
1082
1261
|
}
|
|
1262
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1083
1263
|
return parts.join("\n").trim();
|
|
1084
1264
|
}
|
|
1085
|
-
function buildMandatoryPrereadBlock(
|
|
1086
|
-
if (
|
|
1265
|
+
function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
|
|
1266
|
+
if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
|
|
1087
1267
|
return "";
|
|
1088
1268
|
}
|
|
1089
|
-
const
|
|
1090
|
-
const tokenList = [];
|
|
1091
|
-
let counter = 0;
|
|
1092
|
-
for (const absolutePath of instructionFiles) {
|
|
1093
|
-
counter += 1;
|
|
1269
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
1094
1270
|
const fileName = import_node_path3.default.basename(absolutePath);
|
|
1095
1271
|
const fileUri = pathToFileUri(absolutePath);
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
`Then fetch all documentation required by the instructions before proceeding with your task.`
|
|
1113
|
-
].join(" ");
|
|
1114
|
-
return `[[ ## mandatory_pre_read ## ]]
|
|
1115
|
-
|
|
1116
|
-
${instruction}
|
|
1117
|
-
|
|
1118
|
-
`;
|
|
1272
|
+
return `* [${fileName}](${fileUri})`;
|
|
1273
|
+
});
|
|
1274
|
+
const sections = [];
|
|
1275
|
+
if (guidelineFiles.length > 0) {
|
|
1276
|
+
sections.push(`Read all guideline files:
|
|
1277
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
1278
|
+
}
|
|
1279
|
+
if (attachmentFiles.length > 0) {
|
|
1280
|
+
sections.push(`Read all attachment files:
|
|
1281
|
+
${buildList(attachmentFiles).join("\n")}.`);
|
|
1282
|
+
}
|
|
1283
|
+
sections.push(
|
|
1284
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
1285
|
+
"Then apply system_instructions on the user query below."
|
|
1286
|
+
);
|
|
1287
|
+
return sections.join("\n");
|
|
1119
1288
|
}
|
|
1120
|
-
function
|
|
1289
|
+
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
1121
1290
|
if (!attachments || attachments.length === 0) {
|
|
1122
1291
|
return [];
|
|
1123
1292
|
}
|
|
1124
1293
|
const unique = /* @__PURE__ */ new Map();
|
|
1125
1294
|
for (const attachment of attachments) {
|
|
1126
|
-
|
|
1127
|
-
|
|
1295
|
+
const absolutePath = import_node_path3.default.resolve(attachment);
|
|
1296
|
+
const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
|
|
1297
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1298
|
+
if (!unique.has(absolutePath)) {
|
|
1299
|
+
unique.set(absolutePath, absolutePath);
|
|
1300
|
+
}
|
|
1128
1301
|
}
|
|
1302
|
+
}
|
|
1303
|
+
return Array.from(unique.values());
|
|
1304
|
+
}
|
|
1305
|
+
function collectAttachmentFiles(attachments) {
|
|
1306
|
+
if (!attachments || attachments.length === 0) {
|
|
1307
|
+
return [];
|
|
1308
|
+
}
|
|
1309
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1310
|
+
for (const attachment of attachments) {
|
|
1129
1311
|
const absolutePath = import_node_path3.default.resolve(attachment);
|
|
1130
1312
|
if (!unique.has(absolutePath)) {
|
|
1131
1313
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1133,10 +1315,6 @@ function collectInstructionFiles(attachments) {
|
|
|
1133
1315
|
}
|
|
1134
1316
|
return Array.from(unique.values());
|
|
1135
1317
|
}
|
|
1136
|
-
function isInstructionPath(filePath) {
|
|
1137
|
-
const normalized = filePath.split(import_node_path3.default.sep).join("/");
|
|
1138
|
-
return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
|
|
1139
|
-
}
|
|
1140
1318
|
function pathToFileUri(filePath) {
|
|
1141
1319
|
const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
|
|
1142
1320
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
@@ -1145,14 +1323,6 @@ function pathToFileUri(filePath) {
|
|
|
1145
1323
|
}
|
|
1146
1324
|
return `file://${normalizedPath}`;
|
|
1147
1325
|
}
|
|
1148
|
-
function composeUserQuery(request) {
|
|
1149
|
-
const segments = [];
|
|
1150
|
-
segments.push(request.prompt.trim());
|
|
1151
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1152
|
-
segments.push("\nGuidelines:\n", request.guidelines.trim());
|
|
1153
|
-
}
|
|
1154
|
-
return segments.join("\n").trim();
|
|
1155
|
-
}
|
|
1156
1326
|
function normalizeAttachments(attachments) {
|
|
1157
1327
|
if (!attachments || attachments.length === 0) {
|
|
1158
1328
|
return void 0;
|
|
@@ -1163,6 +1333,16 @@ function normalizeAttachments(attachments) {
|
|
|
1163
1333
|
}
|
|
1164
1334
|
return Array.from(deduped);
|
|
1165
1335
|
}
|
|
1336
|
+
function mergeAttachments(all) {
|
|
1337
|
+
const deduped = /* @__PURE__ */ new Set();
|
|
1338
|
+
for (const list of all) {
|
|
1339
|
+
if (!list) continue;
|
|
1340
|
+
for (const attachment of list) {
|
|
1341
|
+
deduped.add(import_node_path3.default.resolve(attachment));
|
|
1342
|
+
}
|
|
1343
|
+
}
|
|
1344
|
+
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
1345
|
+
}
|
|
1166
1346
|
async function ensureVSCodeSubagents(options) {
|
|
1167
1347
|
const { kind, count, verbose = false } = options;
|
|
1168
1348
|
const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
|
|
@@ -1504,7 +1684,7 @@ var import_node_crypto = require("crypto");
|
|
|
1504
1684
|
var HeuristicGrader = class {
|
|
1505
1685
|
kind = "heuristic";
|
|
1506
1686
|
grade(context) {
|
|
1507
|
-
const expectedAspects = extractAspects(context.
|
|
1687
|
+
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1508
1688
|
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1509
1689
|
const misses = [...result.misses];
|
|
1510
1690
|
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
@@ -1537,14 +1717,14 @@ var QualityGrader = class {
|
|
|
1537
1717
|
if (!judgeProvider) {
|
|
1538
1718
|
throw new Error("No judge provider available for LLM grading");
|
|
1539
1719
|
}
|
|
1540
|
-
const prompt = buildQualityPrompt(context.
|
|
1720
|
+
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
1541
1721
|
const metadata = {
|
|
1542
1722
|
systemPrompt: QUALITY_SYSTEM_PROMPT
|
|
1543
1723
|
};
|
|
1544
1724
|
const response = await judgeProvider.invoke({
|
|
1545
1725
|
prompt,
|
|
1546
1726
|
metadata,
|
|
1547
|
-
|
|
1727
|
+
evalCaseId: context.evalCase.id,
|
|
1548
1728
|
attempt: context.attempt,
|
|
1549
1729
|
maxOutputTokens: this.maxOutputTokens,
|
|
1550
1730
|
temperature: this.temperature
|
|
@@ -1590,16 +1770,16 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
1590
1770
|
function buildQualityPrompt(testCase, candidate) {
|
|
1591
1771
|
const parts = [
|
|
1592
1772
|
"[[ ## expected_outcome ## ]]",
|
|
1593
|
-
testCase.outcome,
|
|
1773
|
+
testCase.outcome.trim(),
|
|
1594
1774
|
"",
|
|
1595
1775
|
"[[ ## request ## ]]",
|
|
1596
|
-
testCase.task,
|
|
1776
|
+
testCase.task.trim(),
|
|
1597
1777
|
"",
|
|
1598
1778
|
"[[ ## reference_answer ## ]]",
|
|
1599
|
-
testCase.expected_assistant_raw,
|
|
1779
|
+
testCase.expected_assistant_raw.trim(),
|
|
1600
1780
|
"",
|
|
1601
1781
|
"[[ ## generated_answer ## ]]",
|
|
1602
|
-
candidate,
|
|
1782
|
+
candidate.trim(),
|
|
1603
1783
|
"",
|
|
1604
1784
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
1605
1785
|
];
|
|
@@ -1848,10 +2028,10 @@ async function runEvaluation(options) {
|
|
|
1848
2028
|
onResult,
|
|
1849
2029
|
onProgress
|
|
1850
2030
|
} = options;
|
|
1851
|
-
const load =
|
|
1852
|
-
const
|
|
1853
|
-
const
|
|
1854
|
-
if (
|
|
2031
|
+
const load = loadEvalCases;
|
|
2032
|
+
const evalCases = await load(testFilePath, repoRoot, { verbose });
|
|
2033
|
+
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
2034
|
+
if (filteredEvalCases.length === 0) {
|
|
1855
2035
|
if (evalId) {
|
|
1856
2036
|
throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
|
|
1857
2037
|
}
|
|
@@ -1897,35 +2077,62 @@ async function runEvaluation(options) {
|
|
|
1897
2077
|
};
|
|
1898
2078
|
const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
|
|
1899
2079
|
const primaryProvider = getOrCreateProvider(target);
|
|
1900
|
-
|
|
1901
|
-
|
|
2080
|
+
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
2081
|
+
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
2082
|
+
console.warn(
|
|
2083
|
+
`Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
|
|
2084
|
+
);
|
|
2085
|
+
}
|
|
2086
|
+
if (onProgress && filteredEvalCases.length > 0) {
|
|
2087
|
+
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1902
2088
|
await onProgress({
|
|
1903
2089
|
workerId: i + 1,
|
|
1904
|
-
evalId:
|
|
2090
|
+
evalId: filteredEvalCases[i].id,
|
|
1905
2091
|
status: "pending"
|
|
1906
2092
|
});
|
|
1907
2093
|
}
|
|
1908
2094
|
}
|
|
2095
|
+
if (providerSupportsBatch) {
|
|
2096
|
+
try {
|
|
2097
|
+
return await runBatchEvaluation({
|
|
2098
|
+
evalCases: filteredEvalCases,
|
|
2099
|
+
provider: primaryProvider,
|
|
2100
|
+
target,
|
|
2101
|
+
graderRegistry,
|
|
2102
|
+
promptDumpDir,
|
|
2103
|
+
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
2104
|
+
onProgress,
|
|
2105
|
+
onResult,
|
|
2106
|
+
verbose,
|
|
2107
|
+
resolveJudgeProvider
|
|
2108
|
+
});
|
|
2109
|
+
} catch (error) {
|
|
2110
|
+
if (verbose) {
|
|
2111
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
2112
|
+
console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
|
|
2113
|
+
}
|
|
2114
|
+
}
|
|
2115
|
+
}
|
|
1909
2116
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1910
2117
|
const limit = pLimit(workers);
|
|
1911
2118
|
let nextWorkerId = 1;
|
|
1912
2119
|
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
1913
|
-
const promises =
|
|
1914
|
-
(
|
|
2120
|
+
const promises = filteredEvalCases.map(
|
|
2121
|
+
(evalCase) => limit(async () => {
|
|
1915
2122
|
const workerId = nextWorkerId++;
|
|
1916
|
-
workerIdByEvalId.set(
|
|
2123
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
1917
2124
|
if (onProgress) {
|
|
1918
2125
|
await onProgress({
|
|
1919
2126
|
workerId,
|
|
1920
|
-
evalId:
|
|
2127
|
+
evalId: evalCase.id,
|
|
1921
2128
|
status: "running",
|
|
1922
2129
|
startedAt: Date.now()
|
|
1923
2130
|
});
|
|
1924
2131
|
}
|
|
1925
2132
|
try {
|
|
1926
2133
|
const judgeProvider = await resolveJudgeProvider(target);
|
|
1927
|
-
const result = await
|
|
1928
|
-
|
|
2134
|
+
const result = await runEvalCase({
|
|
2135
|
+
evalCase,
|
|
1929
2136
|
provider: primaryProvider,
|
|
1930
2137
|
target,
|
|
1931
2138
|
graders: graderRegistry,
|
|
@@ -1940,7 +2147,7 @@ async function runEvaluation(options) {
|
|
|
1940
2147
|
if (onProgress) {
|
|
1941
2148
|
await onProgress({
|
|
1942
2149
|
workerId,
|
|
1943
|
-
evalId:
|
|
2150
|
+
evalId: evalCase.id,
|
|
1944
2151
|
status: "completed",
|
|
1945
2152
|
startedAt: 0,
|
|
1946
2153
|
// Not used for completed status
|
|
@@ -1955,7 +2162,7 @@ async function runEvaluation(options) {
|
|
|
1955
2162
|
if (onProgress) {
|
|
1956
2163
|
await onProgress({
|
|
1957
2164
|
workerId,
|
|
1958
|
-
evalId:
|
|
2165
|
+
evalId: evalCase.id,
|
|
1959
2166
|
status: "failed",
|
|
1960
2167
|
completedAt: Date.now(),
|
|
1961
2168
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -1972,10 +2179,10 @@ async function runEvaluation(options) {
|
|
|
1972
2179
|
if (outcome.status === "fulfilled") {
|
|
1973
2180
|
results.push(outcome.value);
|
|
1974
2181
|
} else {
|
|
1975
|
-
const
|
|
1976
|
-
const promptInputs = await buildPromptInputs(
|
|
2182
|
+
const evalCase = filteredEvalCases[i];
|
|
2183
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1977
2184
|
const errorResult = buildErrorResult(
|
|
1978
|
-
|
|
2185
|
+
evalCase,
|
|
1979
2186
|
target.name,
|
|
1980
2187
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
1981
2188
|
outcome.reason,
|
|
@@ -1989,9 +2196,140 @@ async function runEvaluation(options) {
|
|
|
1989
2196
|
}
|
|
1990
2197
|
return results;
|
|
1991
2198
|
}
|
|
1992
|
-
async function
|
|
2199
|
+
async function runBatchEvaluation(options) {
|
|
2200
|
+
const {
|
|
2201
|
+
evalCases,
|
|
2202
|
+
provider,
|
|
2203
|
+
target,
|
|
2204
|
+
graderRegistry,
|
|
2205
|
+
promptDumpDir,
|
|
2206
|
+
nowFn,
|
|
2207
|
+
onProgress,
|
|
2208
|
+
onResult,
|
|
2209
|
+
resolveJudgeProvider
|
|
2210
|
+
} = options;
|
|
2211
|
+
const promptInputsList = [];
|
|
2212
|
+
for (const evalCase of evalCases) {
|
|
2213
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
2214
|
+
if (promptDumpDir) {
|
|
2215
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
2216
|
+
}
|
|
2217
|
+
promptInputsList.push(promptInputs);
|
|
2218
|
+
}
|
|
2219
|
+
const batchRequests = evalCases.map((evalCase, index) => {
|
|
2220
|
+
const promptInputs = promptInputsList[index];
|
|
2221
|
+
return {
|
|
2222
|
+
prompt: promptInputs.request,
|
|
2223
|
+
guidelines: promptInputs.guidelines,
|
|
2224
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2225
|
+
attachments: evalCase.file_paths,
|
|
2226
|
+
evalCaseId: evalCase.id,
|
|
2227
|
+
metadata: {
|
|
2228
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2229
|
+
}
|
|
2230
|
+
};
|
|
2231
|
+
});
|
|
2232
|
+
const batchResponse = await provider.invokeBatch?.(batchRequests);
|
|
2233
|
+
if (!Array.isArray(batchResponse)) {
|
|
2234
|
+
throw new Error("Provider batching failed: invokeBatch did not return an array");
|
|
2235
|
+
}
|
|
2236
|
+
if (batchResponse.length !== evalCases.length) {
|
|
2237
|
+
throw new Error(
|
|
2238
|
+
`Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
|
|
2239
|
+
);
|
|
2240
|
+
}
|
|
2241
|
+
if (onProgress) {
|
|
2242
|
+
const startedAt = Date.now();
|
|
2243
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2244
|
+
await onProgress({
|
|
2245
|
+
workerId: 1,
|
|
2246
|
+
evalId: evalCases[i].id,
|
|
2247
|
+
status: "running",
|
|
2248
|
+
startedAt
|
|
2249
|
+
});
|
|
2250
|
+
}
|
|
2251
|
+
}
|
|
2252
|
+
const results = [];
|
|
2253
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2254
|
+
const evalCase = evalCases[i];
|
|
2255
|
+
const promptInputs = promptInputsList[i];
|
|
2256
|
+
const providerResponse = batchResponse[i];
|
|
2257
|
+
const now = nowFn();
|
|
2258
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
2259
|
+
const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
|
|
2260
|
+
if (!activeGrader) {
|
|
2261
|
+
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2262
|
+
}
|
|
2263
|
+
let grade;
|
|
2264
|
+
try {
|
|
2265
|
+
grade = await activeGrader.grade({
|
|
2266
|
+
evalCase,
|
|
2267
|
+
candidate: providerResponse.text ?? "",
|
|
2268
|
+
target,
|
|
2269
|
+
provider,
|
|
2270
|
+
attempt: 0,
|
|
2271
|
+
promptInputs,
|
|
2272
|
+
now,
|
|
2273
|
+
judgeProvider: await resolveJudgeProvider(target)
|
|
2274
|
+
});
|
|
2275
|
+
} catch (error) {
|
|
2276
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2277
|
+
results.push(errorResult);
|
|
2278
|
+
if (onResult) {
|
|
2279
|
+
await onResult(errorResult);
|
|
2280
|
+
}
|
|
2281
|
+
if (onProgress) {
|
|
2282
|
+
await onProgress({
|
|
2283
|
+
workerId: 1,
|
|
2284
|
+
evalId: evalCase.id,
|
|
2285
|
+
status: "failed",
|
|
2286
|
+
completedAt: Date.now(),
|
|
2287
|
+
error: error instanceof Error ? error.message : String(error)
|
|
2288
|
+
});
|
|
2289
|
+
}
|
|
2290
|
+
continue;
|
|
2291
|
+
}
|
|
2292
|
+
const completedAt = nowFn();
|
|
2293
|
+
const rawRequest = {
|
|
2294
|
+
request: promptInputs.request,
|
|
2295
|
+
guidelines: promptInputs.guidelines,
|
|
2296
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2297
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
2298
|
+
};
|
|
2299
|
+
const result = {
|
|
2300
|
+
eval_id: evalCase.id,
|
|
2301
|
+
conversation_id: evalCase.conversation_id,
|
|
2302
|
+
score: grade.score,
|
|
2303
|
+
hits: grade.hits,
|
|
2304
|
+
misses: grade.misses,
|
|
2305
|
+
model_answer: providerResponse.text ?? "",
|
|
2306
|
+
expected_aspect_count: grade.expectedAspectCount,
|
|
2307
|
+
target: target.name,
|
|
2308
|
+
timestamp: completedAt.toISOString(),
|
|
2309
|
+
reasoning: grade.reasoning,
|
|
2310
|
+
raw_aspects: grade.rawAspects,
|
|
2311
|
+
raw_request: rawRequest,
|
|
2312
|
+
grader_raw_request: grade.graderRawRequest
|
|
2313
|
+
};
|
|
2314
|
+
results.push(result);
|
|
2315
|
+
if (onResult) {
|
|
2316
|
+
await onResult(result);
|
|
2317
|
+
}
|
|
2318
|
+
if (onProgress) {
|
|
2319
|
+
await onProgress({
|
|
2320
|
+
workerId: 1,
|
|
2321
|
+
evalId: evalCase.id,
|
|
2322
|
+
status: "completed",
|
|
2323
|
+
startedAt: 0,
|
|
2324
|
+
completedAt: Date.now()
|
|
2325
|
+
});
|
|
2326
|
+
}
|
|
2327
|
+
}
|
|
2328
|
+
return results;
|
|
2329
|
+
}
|
|
2330
|
+
async function runEvalCase(options) {
|
|
1993
2331
|
const {
|
|
1994
|
-
|
|
2332
|
+
evalCase,
|
|
1995
2333
|
provider,
|
|
1996
2334
|
target,
|
|
1997
2335
|
graders,
|
|
@@ -2004,11 +2342,11 @@ async function runTestCase(options) {
|
|
|
2004
2342
|
signal,
|
|
2005
2343
|
judgeProvider
|
|
2006
2344
|
} = options;
|
|
2007
|
-
const promptInputs = await buildPromptInputs(
|
|
2345
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
2008
2346
|
if (promptDumpDir) {
|
|
2009
|
-
await dumpPrompt(promptDumpDir,
|
|
2347
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
2010
2348
|
}
|
|
2011
|
-
const cacheKey = useCache ? createCacheKey(provider, target,
|
|
2349
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
2012
2350
|
let cachedResponse;
|
|
2013
2351
|
if (cacheKey && cache) {
|
|
2014
2352
|
cachedResponse = await cache.get(cacheKey);
|
|
@@ -2021,7 +2359,7 @@ async function runTestCase(options) {
|
|
|
2021
2359
|
while (!providerResponse && attempt < attemptBudget) {
|
|
2022
2360
|
try {
|
|
2023
2361
|
providerResponse = await invokeProvider(provider, {
|
|
2024
|
-
|
|
2362
|
+
evalCase,
|
|
2025
2363
|
target,
|
|
2026
2364
|
promptInputs,
|
|
2027
2365
|
attempt,
|
|
@@ -2034,12 +2372,12 @@ async function runTestCase(options) {
|
|
|
2034
2372
|
attempt += 1;
|
|
2035
2373
|
continue;
|
|
2036
2374
|
}
|
|
2037
|
-
return buildErrorResult(
|
|
2375
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2038
2376
|
}
|
|
2039
2377
|
}
|
|
2040
2378
|
if (!providerResponse) {
|
|
2041
2379
|
return buildErrorResult(
|
|
2042
|
-
|
|
2380
|
+
evalCase,
|
|
2043
2381
|
target.name,
|
|
2044
2382
|
nowFn(),
|
|
2045
2383
|
lastError ?? new Error("Provider did not return a response"),
|
|
@@ -2049,7 +2387,7 @@ async function runTestCase(options) {
|
|
|
2049
2387
|
if (cacheKey && cache && !cachedResponse) {
|
|
2050
2388
|
await cache.set(cacheKey, providerResponse);
|
|
2051
2389
|
}
|
|
2052
|
-
const graderKind =
|
|
2390
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
2053
2391
|
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
2054
2392
|
if (!activeGrader) {
|
|
2055
2393
|
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
@@ -2058,7 +2396,7 @@ async function runTestCase(options) {
|
|
|
2058
2396
|
try {
|
|
2059
2397
|
const gradeTimestamp = nowFn();
|
|
2060
2398
|
grade = await activeGrader.grade({
|
|
2061
|
-
|
|
2399
|
+
evalCase,
|
|
2062
2400
|
candidate: providerResponse.text ?? "",
|
|
2063
2401
|
target,
|
|
2064
2402
|
provider,
|
|
@@ -2068,17 +2406,18 @@ async function runTestCase(options) {
|
|
|
2068
2406
|
judgeProvider
|
|
2069
2407
|
});
|
|
2070
2408
|
} catch (error) {
|
|
2071
|
-
return buildErrorResult(
|
|
2409
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2072
2410
|
}
|
|
2073
2411
|
const completedAt = nowFn();
|
|
2074
2412
|
const rawRequest = {
|
|
2075
2413
|
request: promptInputs.request,
|
|
2076
2414
|
guidelines: promptInputs.guidelines,
|
|
2077
|
-
guideline_paths:
|
|
2415
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2416
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
2078
2417
|
};
|
|
2079
2418
|
return {
|
|
2080
|
-
eval_id:
|
|
2081
|
-
conversation_id:
|
|
2419
|
+
eval_id: evalCase.id,
|
|
2420
|
+
conversation_id: evalCase.conversation_id,
|
|
2082
2421
|
score: grade.score,
|
|
2083
2422
|
hits: grade.hits,
|
|
2084
2423
|
misses: grade.misses,
|
|
@@ -2092,11 +2431,11 @@ async function runTestCase(options) {
|
|
|
2092
2431
|
grader_raw_request: grade.graderRawRequest
|
|
2093
2432
|
};
|
|
2094
2433
|
}
|
|
2095
|
-
function
|
|
2434
|
+
function filterEvalCases(evalCases, evalId) {
|
|
2096
2435
|
if (!evalId) {
|
|
2097
|
-
return
|
|
2436
|
+
return evalCases;
|
|
2098
2437
|
}
|
|
2099
|
-
return
|
|
2438
|
+
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
2100
2439
|
}
|
|
2101
2440
|
function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
2102
2441
|
const heuristic = overrides?.heuristic ?? new HeuristicGrader();
|
|
@@ -2114,16 +2453,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2114
2453
|
llm_judge: llmJudge
|
|
2115
2454
|
};
|
|
2116
2455
|
}
|
|
2117
|
-
async function dumpPrompt(directory,
|
|
2456
|
+
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
2118
2457
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2119
|
-
const filename = `${timestamp}_${sanitizeFilename(
|
|
2458
|
+
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
2120
2459
|
const filePath = import_node_path5.default.resolve(directory, filename);
|
|
2121
2460
|
await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
|
|
2122
2461
|
const payload = {
|
|
2123
|
-
eval_id:
|
|
2462
|
+
eval_id: evalCase.id,
|
|
2124
2463
|
request: promptInputs.request,
|
|
2125
2464
|
guidelines: promptInputs.guidelines,
|
|
2126
|
-
guideline_paths:
|
|
2465
|
+
guideline_paths: evalCase.guideline_paths
|
|
2127
2466
|
};
|
|
2128
2467
|
await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
2129
2468
|
}
|
|
@@ -2135,7 +2474,7 @@ function sanitizeFilename(value) {
|
|
|
2135
2474
|
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
2136
2475
|
}
|
|
2137
2476
|
async function invokeProvider(provider, options) {
|
|
2138
|
-
const {
|
|
2477
|
+
const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2139
2478
|
const controller = new AbortController();
|
|
2140
2479
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2141
2480
|
if (signal) {
|
|
@@ -2145,12 +2484,12 @@ async function invokeProvider(provider, options) {
|
|
|
2145
2484
|
return await provider.invoke({
|
|
2146
2485
|
prompt: promptInputs.request,
|
|
2147
2486
|
guidelines: promptInputs.guidelines,
|
|
2148
|
-
|
|
2149
|
-
|
|
2487
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2488
|
+
attachments: evalCase.file_paths,
|
|
2489
|
+
evalCaseId: evalCase.id,
|
|
2150
2490
|
attempt,
|
|
2151
2491
|
metadata: {
|
|
2152
|
-
|
|
2153
|
-
grader: testCase.grader
|
|
2492
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2154
2493
|
},
|
|
2155
2494
|
signal: controller.signal
|
|
2156
2495
|
});
|
|
@@ -2160,17 +2499,18 @@ async function invokeProvider(provider, options) {
|
|
|
2160
2499
|
}
|
|
2161
2500
|
}
|
|
2162
2501
|
}
|
|
2163
|
-
function buildErrorResult(
|
|
2502
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
2164
2503
|
const message = error instanceof Error ? error.message : String(error);
|
|
2165
2504
|
const rawRequest = {
|
|
2166
2505
|
request: promptInputs.request,
|
|
2167
2506
|
guidelines: promptInputs.guidelines,
|
|
2168
|
-
guideline_paths:
|
|
2507
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2508
|
+
system_message: promptInputs.systemMessage ?? "",
|
|
2169
2509
|
error: message
|
|
2170
2510
|
};
|
|
2171
2511
|
return {
|
|
2172
|
-
eval_id:
|
|
2173
|
-
conversation_id:
|
|
2512
|
+
eval_id: evalCase.id,
|
|
2513
|
+
conversation_id: evalCase.conversation_id,
|
|
2174
2514
|
score: 0,
|
|
2175
2515
|
hits: [],
|
|
2176
2516
|
misses: [`Error: ${message}`],
|
|
@@ -2182,13 +2522,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
|
|
|
2182
2522
|
raw_request: rawRequest
|
|
2183
2523
|
};
|
|
2184
2524
|
}
|
|
2185
|
-
function createCacheKey(provider, target,
|
|
2525
|
+
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
2186
2526
|
const hash = (0, import_node_crypto2.createHash)("sha256");
|
|
2187
2527
|
hash.update(provider.id);
|
|
2188
2528
|
hash.update(target.name);
|
|
2189
|
-
hash.update(
|
|
2529
|
+
hash.update(evalCase.id);
|
|
2190
2530
|
hash.update(promptInputs.request);
|
|
2191
2531
|
hash.update(promptInputs.guidelines);
|
|
2532
|
+
hash.update(promptInputs.systemMessage ?? "");
|
|
2192
2533
|
return hash.digest("hex");
|
|
2193
2534
|
}
|
|
2194
2535
|
function isTimeoutLike(error) {
|
|
@@ -2217,7 +2558,9 @@ function createAgentKernel() {
|
|
|
2217
2558
|
HeuristicGrader,
|
|
2218
2559
|
QualityGrader,
|
|
2219
2560
|
TEST_MESSAGE_ROLES,
|
|
2561
|
+
buildDirectoryChain,
|
|
2220
2562
|
buildPromptInputs,
|
|
2563
|
+
buildSearchRoots,
|
|
2221
2564
|
calculateHits,
|
|
2222
2565
|
calculateMisses,
|
|
2223
2566
|
createAgentKernel,
|
|
@@ -2225,6 +2568,8 @@ function createAgentKernel() {
|
|
|
2225
2568
|
ensureVSCodeSubagents,
|
|
2226
2569
|
extractAspects,
|
|
2227
2570
|
extractCodeBlocks,
|
|
2571
|
+
fileExists,
|
|
2572
|
+
findGitRoot,
|
|
2228
2573
|
getHitCount,
|
|
2229
2574
|
isErrorLike,
|
|
2230
2575
|
isGraderKind,
|
|
@@ -2234,12 +2579,13 @@ function createAgentKernel() {
|
|
|
2234
2579
|
isTestMessage,
|
|
2235
2580
|
isTestMessageRole,
|
|
2236
2581
|
listTargetNames,
|
|
2237
|
-
|
|
2582
|
+
loadEvalCases,
|
|
2238
2583
|
readTargetDefinitions,
|
|
2239
2584
|
resolveAndCreateProvider,
|
|
2585
|
+
resolveFileReference,
|
|
2240
2586
|
resolveTargetDefinition,
|
|
2587
|
+
runEvalCase,
|
|
2241
2588
|
runEvaluation,
|
|
2242
|
-
runTestCase,
|
|
2243
2589
|
scoreCandidateResponse
|
|
2244
2590
|
});
|
|
2245
2591
|
//# sourceMappingURL=index.cjs.map
|