@agentv/core 0.2.6 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-QVS4OL44.js → chunk-P4GOYWYH.js} +27 -1
- package/dist/chunk-P4GOYWYH.js.map +1 -0
- package/dist/chunk-XXNQA4EW.js +140 -0
- package/dist/chunk-XXNQA4EW.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +93 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +7 -2
- package/dist/evaluation/validation/index.d.ts +7 -2
- package/dist/evaluation/validation/index.js +91 -7
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +533 -187
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +53 -10
- package/dist/index.d.ts +53 -10
- package/dist/index.js +502 -193
- package/dist/index.js.map +1 -1
- package/package.json +6 -2
- package/dist/chunk-QVS4OL44.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
TARGETS_SCHEMA_V2,
|
|
3
|
+
buildDirectoryChain,
|
|
3
4
|
buildSearchRoots,
|
|
5
|
+
fileExists,
|
|
6
|
+
findGitRoot,
|
|
4
7
|
resolveFileReference
|
|
5
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-P4GOYWYH.js";
|
|
6
9
|
|
|
7
10
|
// src/evaluation/types.ts
|
|
8
11
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -56,6 +59,7 @@ function getHitCount(result) {
|
|
|
56
59
|
}
|
|
57
60
|
|
|
58
61
|
// src/evaluation/yaml-parser.ts
|
|
62
|
+
import micromatch from "micromatch";
|
|
59
63
|
import { constants } from "node:fs";
|
|
60
64
|
import { access, readFile } from "node:fs/promises";
|
|
61
65
|
import path from "node:path";
|
|
@@ -65,9 +69,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
|
65
69
|
var ANSI_YELLOW = "\x1B[33m";
|
|
66
70
|
var ANSI_RESET = "\x1B[0m";
|
|
67
71
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
68
|
-
|
|
72
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
73
|
+
async function loadConfig(evalFilePath, repoRoot) {
|
|
74
|
+
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
75
|
+
for (const directory of directories) {
|
|
76
|
+
const configPath = path.join(directory, ".agentv", "config.yaml");
|
|
77
|
+
if (!await fileExists2(configPath)) {
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
try {
|
|
81
|
+
const rawConfig = await readFile(configPath, "utf8");
|
|
82
|
+
const parsed = parse(rawConfig);
|
|
83
|
+
if (!isJsonObject(parsed)) {
|
|
84
|
+
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
85
|
+
continue;
|
|
86
|
+
}
|
|
87
|
+
const config = parsed;
|
|
88
|
+
const schema = config.$schema;
|
|
89
|
+
if (schema !== SCHEMA_CONFIG_V2) {
|
|
90
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
91
|
+
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
92
|
+
logWarning(message);
|
|
93
|
+
continue;
|
|
94
|
+
}
|
|
95
|
+
const guidelinePatterns = config.guideline_patterns;
|
|
96
|
+
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
97
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
98
|
+
continue;
|
|
99
|
+
}
|
|
100
|
+
if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
|
|
101
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
|
|
102
|
+
continue;
|
|
103
|
+
}
|
|
104
|
+
return {
|
|
105
|
+
guideline_patterns: guidelinePatterns
|
|
106
|
+
};
|
|
107
|
+
} catch (error) {
|
|
108
|
+
logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
|
|
109
|
+
continue;
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
function isGuidelineFile(filePath, patterns) {
|
|
69
115
|
const normalized = filePath.split("\\").join("/");
|
|
70
|
-
|
|
116
|
+
const patternsToUse = patterns ?? [];
|
|
117
|
+
return micromatch.isMatch(normalized, patternsToUse);
|
|
71
118
|
}
|
|
72
119
|
function extractCodeBlocks(segments) {
|
|
73
120
|
const codeBlocks = [];
|
|
@@ -87,43 +134,45 @@ function extractCodeBlocks(segments) {
|
|
|
87
134
|
}
|
|
88
135
|
return codeBlocks;
|
|
89
136
|
}
|
|
90
|
-
async function
|
|
137
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
91
138
|
const verbose = options?.verbose ?? false;
|
|
92
|
-
const absoluteTestPath = path.resolve(
|
|
93
|
-
if (!await
|
|
94
|
-
throw new Error(`Test file not found: ${
|
|
139
|
+
const absoluteTestPath = path.resolve(evalFilePath);
|
|
140
|
+
if (!await fileExists2(absoluteTestPath)) {
|
|
141
|
+
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
95
142
|
}
|
|
96
143
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
97
144
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
145
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
146
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
98
147
|
const rawFile = await readFile(absoluteTestPath, "utf8");
|
|
99
148
|
const parsed = parse(rawFile);
|
|
100
149
|
if (!isJsonObject(parsed)) {
|
|
101
|
-
throw new Error(`Invalid test file format: ${
|
|
150
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
102
151
|
}
|
|
103
152
|
const suite = parsed;
|
|
104
153
|
const schema = suite.$schema;
|
|
105
154
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
106
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${
|
|
155
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
107
156
|
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
108
157
|
throw new Error(message);
|
|
109
158
|
}
|
|
110
159
|
const rawTestcases = suite.evalcases;
|
|
111
160
|
if (!Array.isArray(rawTestcases)) {
|
|
112
|
-
throw new Error(`Invalid test file format: ${
|
|
161
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
113
162
|
}
|
|
114
163
|
const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
|
|
115
164
|
const results = [];
|
|
116
|
-
for (const
|
|
117
|
-
if (!isJsonObject(
|
|
165
|
+
for (const rawEvalcase of rawTestcases) {
|
|
166
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
118
167
|
logWarning("Skipping invalid test case entry (expected object)");
|
|
119
168
|
continue;
|
|
120
169
|
}
|
|
121
|
-
const
|
|
122
|
-
const id = asString(
|
|
123
|
-
const conversationId = asString(
|
|
124
|
-
const outcome = asString(
|
|
125
|
-
const inputMessagesValue =
|
|
126
|
-
const expectedMessagesValue =
|
|
170
|
+
const evalcase = rawEvalcase;
|
|
171
|
+
const id = asString(evalcase.id);
|
|
172
|
+
const conversationId = asString(evalcase.conversation_id);
|
|
173
|
+
const outcome = asString(evalcase.outcome);
|
|
174
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
175
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
127
176
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
128
177
|
logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
|
|
129
178
|
continue;
|
|
@@ -136,6 +185,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
136
185
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
137
186
|
const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
|
|
138
187
|
const userMessages = inputMessages.filter((message) => message.role === "user");
|
|
188
|
+
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
139
189
|
if (assistantMessages.length === 0) {
|
|
140
190
|
logWarning(`No assistant message found for test case: ${id}`);
|
|
141
191
|
continue;
|
|
@@ -143,6 +193,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
143
193
|
if (assistantMessages.length > 1) {
|
|
144
194
|
logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
|
|
145
195
|
}
|
|
196
|
+
if (systemMessages.length > 1) {
|
|
197
|
+
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
198
|
+
}
|
|
199
|
+
let systemMessageContent;
|
|
200
|
+
if (systemMessages.length > 0) {
|
|
201
|
+
const content = systemMessages[0]?.content;
|
|
202
|
+
if (typeof content === "string") {
|
|
203
|
+
systemMessageContent = content;
|
|
204
|
+
} else if (Array.isArray(content)) {
|
|
205
|
+
const textParts = [];
|
|
206
|
+
for (const segment of content) {
|
|
207
|
+
if (isJsonObject(segment)) {
|
|
208
|
+
const value = segment.value;
|
|
209
|
+
if (typeof value === "string") {
|
|
210
|
+
textParts.push(value);
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
if (textParts.length > 0) {
|
|
215
|
+
systemMessageContent = textParts.join("\n\n");
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
146
219
|
const userSegments = [];
|
|
147
220
|
const guidelinePaths = [];
|
|
148
221
|
const userTextParts = [];
|
|
@@ -174,7 +247,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
174
247
|
}
|
|
175
248
|
try {
|
|
176
249
|
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
177
|
-
|
|
250
|
+
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
251
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
178
252
|
guidelinePaths.push(path.resolve(resolvedPath));
|
|
179
253
|
if (verbose) {
|
|
180
254
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
@@ -184,7 +258,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
184
258
|
userSegments.push({
|
|
185
259
|
type: "file",
|
|
186
260
|
path: displayPath,
|
|
187
|
-
text: fileContent
|
|
261
|
+
text: fileContent,
|
|
262
|
+
resolvedPath: path.resolve(resolvedPath)
|
|
188
263
|
});
|
|
189
264
|
if (verbose) {
|
|
190
265
|
console.log(` [File] Found: ${displayPath}`);
|
|
@@ -208,14 +283,27 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
208
283
|
const assistantContent = assistantMessages[0]?.content;
|
|
209
284
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
210
285
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
211
|
-
const testCaseGrader = coerceGrader(
|
|
286
|
+
const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
|
|
287
|
+
const userFilePaths = [];
|
|
288
|
+
for (const segment of userSegments) {
|
|
289
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
290
|
+
userFilePaths.push(segment.resolvedPath);
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
const allFilePaths = [
|
|
294
|
+
...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
295
|
+
...userFilePaths
|
|
296
|
+
];
|
|
212
297
|
const testCase = {
|
|
213
298
|
id,
|
|
214
299
|
conversation_id: conversationId,
|
|
215
300
|
task: userTextPrompt,
|
|
216
301
|
user_segments: userSegments,
|
|
302
|
+
system_message: systemMessageContent,
|
|
217
303
|
expected_assistant_raw: expectedAssistantRaw,
|
|
218
304
|
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
305
|
+
guideline_patterns: guidelinePatterns,
|
|
306
|
+
file_paths: allFilePaths,
|
|
219
307
|
code_snippets: codeSnippets,
|
|
220
308
|
outcome,
|
|
221
309
|
grader: testCaseGrader
|
|
@@ -240,7 +328,7 @@ async function buildPromptInputs(testCase) {
|
|
|
240
328
|
const guidelineContents = [];
|
|
241
329
|
for (const rawPath of testCase.guideline_paths) {
|
|
242
330
|
const absolutePath = path.resolve(rawPath);
|
|
243
|
-
if (!await
|
|
331
|
+
if (!await fileExists2(absolutePath)) {
|
|
244
332
|
logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
245
333
|
continue;
|
|
246
334
|
}
|
|
@@ -281,9 +369,9 @@ ${body}`);
|
|
|
281
369
|
}
|
|
282
370
|
const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
283
371
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
284
|
-
return { request, guidelines };
|
|
372
|
+
return { request, guidelines, systemMessage: testCase.system_message };
|
|
285
373
|
}
|
|
286
|
-
async function
|
|
374
|
+
async function fileExists2(absolutePath) {
|
|
287
375
|
try {
|
|
288
376
|
await access(absolutePath, constants.F_OK);
|
|
289
377
|
return true;
|
|
@@ -407,15 +495,18 @@ function buildChatPrompt(request) {
|
|
|
407
495
|
return request.chatPrompt;
|
|
408
496
|
}
|
|
409
497
|
const systemSegments = [];
|
|
410
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
411
|
-
systemSegments.push(`Guidelines:
|
|
412
|
-
${request.guidelines.trim()}`);
|
|
413
|
-
}
|
|
414
498
|
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
415
499
|
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
416
500
|
systemSegments.push(metadataSystemPrompt.trim());
|
|
501
|
+
} else {
|
|
502
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
417
503
|
}
|
|
418
|
-
|
|
504
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
505
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
506
|
+
|
|
507
|
+
${request.guidelines.trim()}`);
|
|
508
|
+
}
|
|
509
|
+
const systemContent = systemSegments.join("\n\n");
|
|
419
510
|
const userContent = request.prompt.trim();
|
|
420
511
|
const prompt = [
|
|
421
512
|
{
|
|
@@ -644,6 +735,9 @@ function normalizeAzureApiVersion(value) {
|
|
|
644
735
|
function resolveTargetDefinition(definition, env = process.env) {
|
|
645
736
|
const parsed = BASE_TARGET_SCHEMA.parse(definition);
|
|
646
737
|
const provider = parsed.provider.toLowerCase();
|
|
738
|
+
const providerBatching = resolveOptionalBoolean(
|
|
739
|
+
parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
|
|
740
|
+
);
|
|
647
741
|
switch (provider) {
|
|
648
742
|
case "azure":
|
|
649
743
|
case "azure-openai":
|
|
@@ -652,6 +746,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
652
746
|
name: parsed.name,
|
|
653
747
|
judgeTarget: parsed.judge_target,
|
|
654
748
|
workers: parsed.workers,
|
|
749
|
+
providerBatching,
|
|
655
750
|
config: resolveAzureConfig(parsed, env)
|
|
656
751
|
};
|
|
657
752
|
case "anthropic":
|
|
@@ -660,6 +755,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
660
755
|
name: parsed.name,
|
|
661
756
|
judgeTarget: parsed.judge_target,
|
|
662
757
|
workers: parsed.workers,
|
|
758
|
+
providerBatching,
|
|
663
759
|
config: resolveAnthropicConfig(parsed, env)
|
|
664
760
|
};
|
|
665
761
|
case "gemini":
|
|
@@ -670,6 +766,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
670
766
|
name: parsed.name,
|
|
671
767
|
judgeTarget: parsed.judge_target,
|
|
672
768
|
workers: parsed.workers,
|
|
769
|
+
providerBatching,
|
|
673
770
|
config: resolveGeminiConfig(parsed, env)
|
|
674
771
|
};
|
|
675
772
|
case "mock":
|
|
@@ -678,6 +775,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
678
775
|
name: parsed.name,
|
|
679
776
|
judgeTarget: parsed.judge_target,
|
|
680
777
|
workers: parsed.workers,
|
|
778
|
+
providerBatching,
|
|
681
779
|
config: resolveMockConfig(parsed)
|
|
682
780
|
};
|
|
683
781
|
case "vscode":
|
|
@@ -687,6 +785,7 @@ function resolveTargetDefinition(definition, env = process.env) {
|
|
|
687
785
|
name: parsed.name,
|
|
688
786
|
judgeTarget: parsed.judge_target,
|
|
689
787
|
workers: parsed.workers,
|
|
788
|
+
providerBatching,
|
|
690
789
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
|
|
691
790
|
};
|
|
692
791
|
default:
|
|
@@ -871,15 +970,19 @@ function isLikelyEnvReference(value) {
|
|
|
871
970
|
}
|
|
872
971
|
|
|
873
972
|
// src/evaluation/providers/vscode.ts
|
|
874
|
-
import {
|
|
875
|
-
import { tmpdir } from "node:os";
|
|
973
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
876
974
|
import path2 from "node:path";
|
|
877
|
-
import {
|
|
878
|
-
|
|
975
|
+
import {
|
|
976
|
+
dispatchAgentSession,
|
|
977
|
+
dispatchBatchAgent,
|
|
978
|
+
getSubagentRoot,
|
|
979
|
+
provisionSubagents
|
|
980
|
+
} from "subagent";
|
|
879
981
|
var VSCodeProvider = class {
|
|
880
982
|
id;
|
|
881
983
|
kind;
|
|
882
984
|
targetName;
|
|
985
|
+
supportsBatch = true;
|
|
883
986
|
config;
|
|
884
987
|
constructor(targetName, config, kind) {
|
|
885
988
|
this.id = `${kind}:${targetName}`;
|
|
@@ -892,117 +995,159 @@ var VSCodeProvider = class {
|
|
|
892
995
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
893
996
|
}
|
|
894
997
|
const attachments = normalizeAttachments(request.attachments);
|
|
895
|
-
const promptContent = buildPromptDocument(request, attachments);
|
|
896
|
-
const
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
913
|
-
throw new Error(failure);
|
|
914
|
-
}
|
|
915
|
-
if (this.config.dryRun) {
|
|
916
|
-
return {
|
|
917
|
-
text: "",
|
|
918
|
-
raw: {
|
|
919
|
-
session,
|
|
920
|
-
promptFile: promptPath,
|
|
921
|
-
attachments
|
|
922
|
-
}
|
|
923
|
-
};
|
|
924
|
-
}
|
|
925
|
-
const responseText = await readFile2(session.responseFile, "utf8");
|
|
998
|
+
const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
|
|
999
|
+
const session = await dispatchAgentSession({
|
|
1000
|
+
userQuery: promptContent,
|
|
1001
|
+
// Use full prompt content instead of just request.prompt
|
|
1002
|
+
extraAttachments: attachments,
|
|
1003
|
+
wait: this.config.waitForResponse,
|
|
1004
|
+
dryRun: this.config.dryRun,
|
|
1005
|
+
vscodeCmd: this.config.command,
|
|
1006
|
+
subagentRoot: this.config.subagentRoot,
|
|
1007
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1008
|
+
silent: true
|
|
1009
|
+
});
|
|
1010
|
+
if (session.exitCode !== 0 || !session.responseFile) {
|
|
1011
|
+
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
1012
|
+
throw new Error(failure);
|
|
1013
|
+
}
|
|
1014
|
+
if (this.config.dryRun) {
|
|
926
1015
|
return {
|
|
927
|
-
text:
|
|
1016
|
+
text: "",
|
|
928
1017
|
raw: {
|
|
929
1018
|
session,
|
|
930
|
-
promptFile: promptPath,
|
|
931
1019
|
attachments
|
|
932
1020
|
}
|
|
933
1021
|
};
|
|
934
|
-
} finally {
|
|
935
|
-
await rm(directory, { recursive: true, force: true });
|
|
936
1022
|
}
|
|
1023
|
+
const responseText = await readFile2(session.responseFile, "utf8");
|
|
1024
|
+
return {
|
|
1025
|
+
text: responseText,
|
|
1026
|
+
raw: {
|
|
1027
|
+
session,
|
|
1028
|
+
attachments
|
|
1029
|
+
}
|
|
1030
|
+
};
|
|
1031
|
+
}
|
|
1032
|
+
async invokeBatch(requests) {
|
|
1033
|
+
if (requests.length === 0) {
|
|
1034
|
+
return [];
|
|
1035
|
+
}
|
|
1036
|
+
const normalizedRequests = requests.map((req) => ({
|
|
1037
|
+
request: req,
|
|
1038
|
+
attachments: normalizeAttachments(req.attachments)
|
|
1039
|
+
}));
|
|
1040
|
+
const combinedAttachments = mergeAttachments(
|
|
1041
|
+
normalizedRequests.map(({ attachments }) => attachments)
|
|
1042
|
+
);
|
|
1043
|
+
const userQueries = normalizedRequests.map(
|
|
1044
|
+
({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
|
|
1045
|
+
);
|
|
1046
|
+
const session = await dispatchBatchAgent({
|
|
1047
|
+
userQueries,
|
|
1048
|
+
extraAttachments: combinedAttachments,
|
|
1049
|
+
wait: this.config.waitForResponse,
|
|
1050
|
+
dryRun: this.config.dryRun,
|
|
1051
|
+
vscodeCmd: this.config.command,
|
|
1052
|
+
subagentRoot: this.config.subagentRoot,
|
|
1053
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1054
|
+
silent: true
|
|
1055
|
+
});
|
|
1056
|
+
if (session.exitCode !== 0 || !session.responseFiles) {
|
|
1057
|
+
const failure = session.error ?? "VS Code subagent did not produce batch responses";
|
|
1058
|
+
throw new Error(failure);
|
|
1059
|
+
}
|
|
1060
|
+
if (this.config.dryRun) {
|
|
1061
|
+
return normalizedRequests.map(({ attachments }) => ({
|
|
1062
|
+
text: "",
|
|
1063
|
+
raw: {
|
|
1064
|
+
session,
|
|
1065
|
+
attachments,
|
|
1066
|
+
allAttachments: combinedAttachments
|
|
1067
|
+
}
|
|
1068
|
+
}));
|
|
1069
|
+
}
|
|
1070
|
+
if (session.responseFiles.length !== requests.length) {
|
|
1071
|
+
throw new Error(
|
|
1072
|
+
`VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
|
|
1073
|
+
);
|
|
1074
|
+
}
|
|
1075
|
+
const responses = [];
|
|
1076
|
+
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
1077
|
+
const responseText = await readFile2(responseFile, "utf8");
|
|
1078
|
+
responses.push({
|
|
1079
|
+
text: responseText,
|
|
1080
|
+
raw: {
|
|
1081
|
+
session,
|
|
1082
|
+
attachments: normalizedRequests[index]?.attachments,
|
|
1083
|
+
allAttachments: combinedAttachments,
|
|
1084
|
+
responseFile
|
|
1085
|
+
}
|
|
1086
|
+
});
|
|
1087
|
+
}
|
|
1088
|
+
return responses;
|
|
937
1089
|
}
|
|
938
1090
|
};
|
|
939
|
-
function buildPromptDocument(request, attachments) {
|
|
1091
|
+
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
940
1092
|
const parts = [];
|
|
941
|
-
const
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
if (request.metadata?.target) {
|
|
950
|
-
parts.push(`- Target: ${String(request.metadata.target)}`);
|
|
951
|
-
}
|
|
952
|
-
parts.push("\n## Task\n", request.prompt.trim());
|
|
953
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
954
|
-
parts.push("\n## Guidelines\n", request.guidelines.trim());
|
|
955
|
-
}
|
|
956
|
-
if (attachments && attachments.length > 0) {
|
|
957
|
-
const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
|
|
958
|
-
parts.push("\n## Attachments\n", attachmentList);
|
|
1093
|
+
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1094
|
+
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
1095
|
+
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
1096
|
+
(file) => !guidelineFiles.includes(file)
|
|
1097
|
+
);
|
|
1098
|
+
const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
|
|
1099
|
+
if (prereadBlock.length > 0) {
|
|
1100
|
+
parts.push("\n", prereadBlock);
|
|
959
1101
|
}
|
|
1102
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
960
1103
|
return parts.join("\n").trim();
|
|
961
1104
|
}
|
|
962
|
-
function buildMandatoryPrereadBlock(
|
|
963
|
-
if (
|
|
1105
|
+
function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
|
|
1106
|
+
if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
|
|
964
1107
|
return "";
|
|
965
1108
|
}
|
|
966
|
-
const
|
|
967
|
-
const tokenList = [];
|
|
968
|
-
let counter = 0;
|
|
969
|
-
for (const absolutePath of instructionFiles) {
|
|
970
|
-
counter += 1;
|
|
1109
|
+
const buildList = (files) => files.map((absolutePath) => {
|
|
971
1110
|
const fileName = path2.basename(absolutePath);
|
|
972
1111
|
const fileUri = pathToFileUri(absolutePath);
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
`Then fetch all documentation required by the instructions before proceeding with your task.`
|
|
990
|
-
].join(" ");
|
|
991
|
-
return `[[ ## mandatory_pre_read ## ]]
|
|
992
|
-
|
|
993
|
-
${instruction}
|
|
994
|
-
|
|
995
|
-
`;
|
|
1112
|
+
return `* [${fileName}](${fileUri})`;
|
|
1113
|
+
});
|
|
1114
|
+
const sections = [];
|
|
1115
|
+
if (guidelineFiles.length > 0) {
|
|
1116
|
+
sections.push(`Read all guideline files:
|
|
1117
|
+
${buildList(guidelineFiles).join("\n")}.`);
|
|
1118
|
+
}
|
|
1119
|
+
if (attachmentFiles.length > 0) {
|
|
1120
|
+
sections.push(`Read all attachment files:
|
|
1121
|
+
${buildList(attachmentFiles).join("\n")}.`);
|
|
1122
|
+
}
|
|
1123
|
+
sections.push(
|
|
1124
|
+
"If any file is missing, fail with ERROR: missing-file <filename> and stop.",
|
|
1125
|
+
"Then apply system_instructions on the user query below."
|
|
1126
|
+
);
|
|
1127
|
+
return sections.join("\n");
|
|
996
1128
|
}
|
|
997
|
-
function
|
|
1129
|
+
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
998
1130
|
if (!attachments || attachments.length === 0) {
|
|
999
1131
|
return [];
|
|
1000
1132
|
}
|
|
1001
1133
|
const unique = /* @__PURE__ */ new Map();
|
|
1002
1134
|
for (const attachment of attachments) {
|
|
1003
|
-
|
|
1004
|
-
|
|
1135
|
+
const absolutePath = path2.resolve(attachment);
|
|
1136
|
+
const normalized = absolutePath.split(path2.sep).join("/");
|
|
1137
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1138
|
+
if (!unique.has(absolutePath)) {
|
|
1139
|
+
unique.set(absolutePath, absolutePath);
|
|
1140
|
+
}
|
|
1005
1141
|
}
|
|
1142
|
+
}
|
|
1143
|
+
return Array.from(unique.values());
|
|
1144
|
+
}
|
|
1145
|
+
function collectAttachmentFiles(attachments) {
|
|
1146
|
+
if (!attachments || attachments.length === 0) {
|
|
1147
|
+
return [];
|
|
1148
|
+
}
|
|
1149
|
+
const unique = /* @__PURE__ */ new Map();
|
|
1150
|
+
for (const attachment of attachments) {
|
|
1006
1151
|
const absolutePath = path2.resolve(attachment);
|
|
1007
1152
|
if (!unique.has(absolutePath)) {
|
|
1008
1153
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1010,10 +1155,6 @@ function collectInstructionFiles(attachments) {
|
|
|
1010
1155
|
}
|
|
1011
1156
|
return Array.from(unique.values());
|
|
1012
1157
|
}
|
|
1013
|
-
function isInstructionPath(filePath) {
|
|
1014
|
-
const normalized = filePath.split(path2.sep).join("/");
|
|
1015
|
-
return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
|
|
1016
|
-
}
|
|
1017
1158
|
function pathToFileUri(filePath) {
|
|
1018
1159
|
const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
|
|
1019
1160
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
@@ -1022,14 +1163,6 @@ function pathToFileUri(filePath) {
|
|
|
1022
1163
|
}
|
|
1023
1164
|
return `file://${normalizedPath}`;
|
|
1024
1165
|
}
|
|
1025
|
-
function composeUserQuery(request) {
|
|
1026
|
-
const segments = [];
|
|
1027
|
-
segments.push(request.prompt.trim());
|
|
1028
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1029
|
-
segments.push("\nGuidelines:\n", request.guidelines.trim());
|
|
1030
|
-
}
|
|
1031
|
-
return segments.join("\n").trim();
|
|
1032
|
-
}
|
|
1033
1166
|
function normalizeAttachments(attachments) {
|
|
1034
1167
|
if (!attachments || attachments.length === 0) {
|
|
1035
1168
|
return void 0;
|
|
@@ -1040,6 +1173,16 @@ function normalizeAttachments(attachments) {
|
|
|
1040
1173
|
}
|
|
1041
1174
|
return Array.from(deduped);
|
|
1042
1175
|
}
|
|
1176
|
+
function mergeAttachments(all) {
|
|
1177
|
+
const deduped = /* @__PURE__ */ new Set();
|
|
1178
|
+
for (const list of all) {
|
|
1179
|
+
if (!list) continue;
|
|
1180
|
+
for (const attachment of list) {
|
|
1181
|
+
deduped.add(path2.resolve(attachment));
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
1185
|
+
}
|
|
1043
1186
|
async function ensureVSCodeSubagents(options) {
|
|
1044
1187
|
const { kind, count, verbose = false } = options;
|
|
1045
1188
|
const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
|
|
@@ -1136,7 +1279,7 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
1136
1279
|
judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
|
|
1137
1280
|
};
|
|
1138
1281
|
}
|
|
1139
|
-
async function
|
|
1282
|
+
async function fileExists3(filePath) {
|
|
1140
1283
|
try {
|
|
1141
1284
|
await access2(filePath, constants2.F_OK);
|
|
1142
1285
|
return true;
|
|
@@ -1146,7 +1289,7 @@ async function fileExists2(filePath) {
|
|
|
1146
1289
|
}
|
|
1147
1290
|
async function readTargetDefinitions(filePath) {
|
|
1148
1291
|
const absolutePath = path3.resolve(filePath);
|
|
1149
|
-
if (!await
|
|
1292
|
+
if (!await fileExists3(absolutePath)) {
|
|
1150
1293
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
1151
1294
|
}
|
|
1152
1295
|
const raw = await readFile3(absolutePath, "utf8");
|
|
@@ -1376,7 +1519,7 @@ import { randomUUID } from "node:crypto";
|
|
|
1376
1519
|
var HeuristicGrader = class {
|
|
1377
1520
|
kind = "heuristic";
|
|
1378
1521
|
grade(context) {
|
|
1379
|
-
const expectedAspects = extractAspects(context.
|
|
1522
|
+
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1380
1523
|
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1381
1524
|
const misses = [...result.misses];
|
|
1382
1525
|
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
@@ -1409,14 +1552,14 @@ var QualityGrader = class {
|
|
|
1409
1552
|
if (!judgeProvider) {
|
|
1410
1553
|
throw new Error("No judge provider available for LLM grading");
|
|
1411
1554
|
}
|
|
1412
|
-
const prompt = buildQualityPrompt(context.
|
|
1555
|
+
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
1413
1556
|
const metadata = {
|
|
1414
1557
|
systemPrompt: QUALITY_SYSTEM_PROMPT
|
|
1415
1558
|
};
|
|
1416
1559
|
const response = await judgeProvider.invoke({
|
|
1417
1560
|
prompt,
|
|
1418
1561
|
metadata,
|
|
1419
|
-
|
|
1562
|
+
evalCaseId: context.evalCase.id,
|
|
1420
1563
|
attempt: context.attempt,
|
|
1421
1564
|
maxOutputTokens: this.maxOutputTokens,
|
|
1422
1565
|
temperature: this.temperature
|
|
@@ -1462,16 +1605,16 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
1462
1605
|
function buildQualityPrompt(testCase, candidate) {
|
|
1463
1606
|
const parts = [
|
|
1464
1607
|
"[[ ## expected_outcome ## ]]",
|
|
1465
|
-
testCase.outcome,
|
|
1608
|
+
testCase.outcome.trim(),
|
|
1466
1609
|
"",
|
|
1467
1610
|
"[[ ## request ## ]]",
|
|
1468
|
-
testCase.task,
|
|
1611
|
+
testCase.task.trim(),
|
|
1469
1612
|
"",
|
|
1470
1613
|
"[[ ## reference_answer ## ]]",
|
|
1471
|
-
testCase.expected_assistant_raw,
|
|
1614
|
+
testCase.expected_assistant_raw.trim(),
|
|
1472
1615
|
"",
|
|
1473
1616
|
"[[ ## generated_answer ## ]]",
|
|
1474
|
-
candidate,
|
|
1617
|
+
candidate.trim(),
|
|
1475
1618
|
"",
|
|
1476
1619
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
1477
1620
|
];
|
|
@@ -1720,10 +1863,10 @@ async function runEvaluation(options) {
|
|
|
1720
1863
|
onResult,
|
|
1721
1864
|
onProgress
|
|
1722
1865
|
} = options;
|
|
1723
|
-
const load =
|
|
1724
|
-
const
|
|
1725
|
-
const
|
|
1726
|
-
if (
|
|
1866
|
+
const load = loadEvalCases;
|
|
1867
|
+
const evalCases = await load(testFilePath, repoRoot, { verbose });
|
|
1868
|
+
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
1869
|
+
if (filteredEvalCases.length === 0) {
|
|
1727
1870
|
if (evalId) {
|
|
1728
1871
|
throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
|
|
1729
1872
|
}
|
|
@@ -1769,35 +1912,62 @@ async function runEvaluation(options) {
|
|
|
1769
1912
|
};
|
|
1770
1913
|
const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
|
|
1771
1914
|
const primaryProvider = getOrCreateProvider(target);
|
|
1772
|
-
|
|
1773
|
-
|
|
1915
|
+
const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
|
|
1916
|
+
if (target.providerBatching && !providerSupportsBatch && verbose) {
|
|
1917
|
+
console.warn(
|
|
1918
|
+
`Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
|
|
1919
|
+
);
|
|
1920
|
+
}
|
|
1921
|
+
if (onProgress && filteredEvalCases.length > 0) {
|
|
1922
|
+
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1774
1923
|
await onProgress({
|
|
1775
1924
|
workerId: i + 1,
|
|
1776
|
-
evalId:
|
|
1925
|
+
evalId: filteredEvalCases[i].id,
|
|
1777
1926
|
status: "pending"
|
|
1778
1927
|
});
|
|
1779
1928
|
}
|
|
1780
1929
|
}
|
|
1930
|
+
if (providerSupportsBatch) {
|
|
1931
|
+
try {
|
|
1932
|
+
return await runBatchEvaluation({
|
|
1933
|
+
evalCases: filteredEvalCases,
|
|
1934
|
+
provider: primaryProvider,
|
|
1935
|
+
target,
|
|
1936
|
+
graderRegistry,
|
|
1937
|
+
promptDumpDir,
|
|
1938
|
+
nowFn: now ?? (() => /* @__PURE__ */ new Date()),
|
|
1939
|
+
onProgress,
|
|
1940
|
+
onResult,
|
|
1941
|
+
verbose,
|
|
1942
|
+
resolveJudgeProvider
|
|
1943
|
+
});
|
|
1944
|
+
} catch (error) {
|
|
1945
|
+
if (verbose) {
|
|
1946
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1947
|
+
console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
|
|
1948
|
+
}
|
|
1949
|
+
}
|
|
1950
|
+
}
|
|
1781
1951
|
const workers = options.maxConcurrency ?? target.workers ?? 1;
|
|
1782
1952
|
const limit = pLimit(workers);
|
|
1783
1953
|
let nextWorkerId = 1;
|
|
1784
1954
|
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
1785
|
-
const promises =
|
|
1786
|
-
(
|
|
1955
|
+
const promises = filteredEvalCases.map(
|
|
1956
|
+
(evalCase) => limit(async () => {
|
|
1787
1957
|
const workerId = nextWorkerId++;
|
|
1788
|
-
workerIdByEvalId.set(
|
|
1958
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
1789
1959
|
if (onProgress) {
|
|
1790
1960
|
await onProgress({
|
|
1791
1961
|
workerId,
|
|
1792
|
-
evalId:
|
|
1962
|
+
evalId: evalCase.id,
|
|
1793
1963
|
status: "running",
|
|
1794
1964
|
startedAt: Date.now()
|
|
1795
1965
|
});
|
|
1796
1966
|
}
|
|
1797
1967
|
try {
|
|
1798
1968
|
const judgeProvider = await resolveJudgeProvider(target);
|
|
1799
|
-
const result = await
|
|
1800
|
-
|
|
1969
|
+
const result = await runEvalCase({
|
|
1970
|
+
evalCase,
|
|
1801
1971
|
provider: primaryProvider,
|
|
1802
1972
|
target,
|
|
1803
1973
|
graders: graderRegistry,
|
|
@@ -1812,7 +1982,7 @@ async function runEvaluation(options) {
|
|
|
1812
1982
|
if (onProgress) {
|
|
1813
1983
|
await onProgress({
|
|
1814
1984
|
workerId,
|
|
1815
|
-
evalId:
|
|
1985
|
+
evalId: evalCase.id,
|
|
1816
1986
|
status: "completed",
|
|
1817
1987
|
startedAt: 0,
|
|
1818
1988
|
// Not used for completed status
|
|
@@ -1827,7 +1997,7 @@ async function runEvaluation(options) {
|
|
|
1827
1997
|
if (onProgress) {
|
|
1828
1998
|
await onProgress({
|
|
1829
1999
|
workerId,
|
|
1830
|
-
evalId:
|
|
2000
|
+
evalId: evalCase.id,
|
|
1831
2001
|
status: "failed",
|
|
1832
2002
|
completedAt: Date.now(),
|
|
1833
2003
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -1844,10 +2014,10 @@ async function runEvaluation(options) {
|
|
|
1844
2014
|
if (outcome.status === "fulfilled") {
|
|
1845
2015
|
results.push(outcome.value);
|
|
1846
2016
|
} else {
|
|
1847
|
-
const
|
|
1848
|
-
const promptInputs = await buildPromptInputs(
|
|
2017
|
+
const evalCase = filteredEvalCases[i];
|
|
2018
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1849
2019
|
const errorResult = buildErrorResult(
|
|
1850
|
-
|
|
2020
|
+
evalCase,
|
|
1851
2021
|
target.name,
|
|
1852
2022
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
1853
2023
|
outcome.reason,
|
|
@@ -1861,9 +2031,140 @@ async function runEvaluation(options) {
|
|
|
1861
2031
|
}
|
|
1862
2032
|
return results;
|
|
1863
2033
|
}
|
|
1864
|
-
async function
|
|
2034
|
+
async function runBatchEvaluation(options) {
|
|
2035
|
+
const {
|
|
2036
|
+
evalCases,
|
|
2037
|
+
provider,
|
|
2038
|
+
target,
|
|
2039
|
+
graderRegistry,
|
|
2040
|
+
promptDumpDir,
|
|
2041
|
+
nowFn,
|
|
2042
|
+
onProgress,
|
|
2043
|
+
onResult,
|
|
2044
|
+
resolveJudgeProvider
|
|
2045
|
+
} = options;
|
|
2046
|
+
const promptInputsList = [];
|
|
2047
|
+
for (const evalCase of evalCases) {
|
|
2048
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
2049
|
+
if (promptDumpDir) {
|
|
2050
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
2051
|
+
}
|
|
2052
|
+
promptInputsList.push(promptInputs);
|
|
2053
|
+
}
|
|
2054
|
+
const batchRequests = evalCases.map((evalCase, index) => {
|
|
2055
|
+
const promptInputs = promptInputsList[index];
|
|
2056
|
+
return {
|
|
2057
|
+
prompt: promptInputs.request,
|
|
2058
|
+
guidelines: promptInputs.guidelines,
|
|
2059
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2060
|
+
attachments: evalCase.file_paths,
|
|
2061
|
+
evalCaseId: evalCase.id,
|
|
2062
|
+
metadata: {
|
|
2063
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2064
|
+
}
|
|
2065
|
+
};
|
|
2066
|
+
});
|
|
2067
|
+
const batchResponse = await provider.invokeBatch?.(batchRequests);
|
|
2068
|
+
if (!Array.isArray(batchResponse)) {
|
|
2069
|
+
throw new Error("Provider batching failed: invokeBatch did not return an array");
|
|
2070
|
+
}
|
|
2071
|
+
if (batchResponse.length !== evalCases.length) {
|
|
2072
|
+
throw new Error(
|
|
2073
|
+
`Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
|
|
2074
|
+
);
|
|
2075
|
+
}
|
|
2076
|
+
if (onProgress) {
|
|
2077
|
+
const startedAt = Date.now();
|
|
2078
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2079
|
+
await onProgress({
|
|
2080
|
+
workerId: 1,
|
|
2081
|
+
evalId: evalCases[i].id,
|
|
2082
|
+
status: "running",
|
|
2083
|
+
startedAt
|
|
2084
|
+
});
|
|
2085
|
+
}
|
|
2086
|
+
}
|
|
2087
|
+
const results = [];
|
|
2088
|
+
for (let i = 0; i < evalCases.length; i++) {
|
|
2089
|
+
const evalCase = evalCases[i];
|
|
2090
|
+
const promptInputs = promptInputsList[i];
|
|
2091
|
+
const providerResponse = batchResponse[i];
|
|
2092
|
+
const now = nowFn();
|
|
2093
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
2094
|
+
const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
|
|
2095
|
+
if (!activeGrader) {
|
|
2096
|
+
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
2097
|
+
}
|
|
2098
|
+
let grade;
|
|
2099
|
+
try {
|
|
2100
|
+
grade = await activeGrader.grade({
|
|
2101
|
+
evalCase,
|
|
2102
|
+
candidate: providerResponse.text ?? "",
|
|
2103
|
+
target,
|
|
2104
|
+
provider,
|
|
2105
|
+
attempt: 0,
|
|
2106
|
+
promptInputs,
|
|
2107
|
+
now,
|
|
2108
|
+
judgeProvider: await resolveJudgeProvider(target)
|
|
2109
|
+
});
|
|
2110
|
+
} catch (error) {
|
|
2111
|
+
const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2112
|
+
results.push(errorResult);
|
|
2113
|
+
if (onResult) {
|
|
2114
|
+
await onResult(errorResult);
|
|
2115
|
+
}
|
|
2116
|
+
if (onProgress) {
|
|
2117
|
+
await onProgress({
|
|
2118
|
+
workerId: 1,
|
|
2119
|
+
evalId: evalCase.id,
|
|
2120
|
+
status: "failed",
|
|
2121
|
+
completedAt: Date.now(),
|
|
2122
|
+
error: error instanceof Error ? error.message : String(error)
|
|
2123
|
+
});
|
|
2124
|
+
}
|
|
2125
|
+
continue;
|
|
2126
|
+
}
|
|
2127
|
+
const completedAt = nowFn();
|
|
2128
|
+
const rawRequest = {
|
|
2129
|
+
request: promptInputs.request,
|
|
2130
|
+
guidelines: promptInputs.guidelines,
|
|
2131
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2132
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
2133
|
+
};
|
|
2134
|
+
const result = {
|
|
2135
|
+
eval_id: evalCase.id,
|
|
2136
|
+
conversation_id: evalCase.conversation_id,
|
|
2137
|
+
score: grade.score,
|
|
2138
|
+
hits: grade.hits,
|
|
2139
|
+
misses: grade.misses,
|
|
2140
|
+
model_answer: providerResponse.text ?? "",
|
|
2141
|
+
expected_aspect_count: grade.expectedAspectCount,
|
|
2142
|
+
target: target.name,
|
|
2143
|
+
timestamp: completedAt.toISOString(),
|
|
2144
|
+
reasoning: grade.reasoning,
|
|
2145
|
+
raw_aspects: grade.rawAspects,
|
|
2146
|
+
raw_request: rawRequest,
|
|
2147
|
+
grader_raw_request: grade.graderRawRequest
|
|
2148
|
+
};
|
|
2149
|
+
results.push(result);
|
|
2150
|
+
if (onResult) {
|
|
2151
|
+
await onResult(result);
|
|
2152
|
+
}
|
|
2153
|
+
if (onProgress) {
|
|
2154
|
+
await onProgress({
|
|
2155
|
+
workerId: 1,
|
|
2156
|
+
evalId: evalCase.id,
|
|
2157
|
+
status: "completed",
|
|
2158
|
+
startedAt: 0,
|
|
2159
|
+
completedAt: Date.now()
|
|
2160
|
+
});
|
|
2161
|
+
}
|
|
2162
|
+
}
|
|
2163
|
+
return results;
|
|
2164
|
+
}
|
|
2165
|
+
async function runEvalCase(options) {
|
|
1865
2166
|
const {
|
|
1866
|
-
|
|
2167
|
+
evalCase,
|
|
1867
2168
|
provider,
|
|
1868
2169
|
target,
|
|
1869
2170
|
graders,
|
|
@@ -1876,11 +2177,11 @@ async function runTestCase(options) {
|
|
|
1876
2177
|
signal,
|
|
1877
2178
|
judgeProvider
|
|
1878
2179
|
} = options;
|
|
1879
|
-
const promptInputs = await buildPromptInputs(
|
|
2180
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1880
2181
|
if (promptDumpDir) {
|
|
1881
|
-
await dumpPrompt(promptDumpDir,
|
|
2182
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
1882
2183
|
}
|
|
1883
|
-
const cacheKey = useCache ? createCacheKey(provider, target,
|
|
2184
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
1884
2185
|
let cachedResponse;
|
|
1885
2186
|
if (cacheKey && cache) {
|
|
1886
2187
|
cachedResponse = await cache.get(cacheKey);
|
|
@@ -1893,7 +2194,7 @@ async function runTestCase(options) {
|
|
|
1893
2194
|
while (!providerResponse && attempt < attemptBudget) {
|
|
1894
2195
|
try {
|
|
1895
2196
|
providerResponse = await invokeProvider(provider, {
|
|
1896
|
-
|
|
2197
|
+
evalCase,
|
|
1897
2198
|
target,
|
|
1898
2199
|
promptInputs,
|
|
1899
2200
|
attempt,
|
|
@@ -1906,12 +2207,12 @@ async function runTestCase(options) {
|
|
|
1906
2207
|
attempt += 1;
|
|
1907
2208
|
continue;
|
|
1908
2209
|
}
|
|
1909
|
-
return buildErrorResult(
|
|
2210
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
1910
2211
|
}
|
|
1911
2212
|
}
|
|
1912
2213
|
if (!providerResponse) {
|
|
1913
2214
|
return buildErrorResult(
|
|
1914
|
-
|
|
2215
|
+
evalCase,
|
|
1915
2216
|
target.name,
|
|
1916
2217
|
nowFn(),
|
|
1917
2218
|
lastError ?? new Error("Provider did not return a response"),
|
|
@@ -1921,7 +2222,7 @@ async function runTestCase(options) {
|
|
|
1921
2222
|
if (cacheKey && cache && !cachedResponse) {
|
|
1922
2223
|
await cache.set(cacheKey, providerResponse);
|
|
1923
2224
|
}
|
|
1924
|
-
const graderKind =
|
|
2225
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
1925
2226
|
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
1926
2227
|
if (!activeGrader) {
|
|
1927
2228
|
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
@@ -1930,7 +2231,7 @@ async function runTestCase(options) {
|
|
|
1930
2231
|
try {
|
|
1931
2232
|
const gradeTimestamp = nowFn();
|
|
1932
2233
|
grade = await activeGrader.grade({
|
|
1933
|
-
|
|
2234
|
+
evalCase,
|
|
1934
2235
|
candidate: providerResponse.text ?? "",
|
|
1935
2236
|
target,
|
|
1936
2237
|
provider,
|
|
@@ -1940,17 +2241,18 @@ async function runTestCase(options) {
|
|
|
1940
2241
|
judgeProvider
|
|
1941
2242
|
});
|
|
1942
2243
|
} catch (error) {
|
|
1943
|
-
return buildErrorResult(
|
|
2244
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
1944
2245
|
}
|
|
1945
2246
|
const completedAt = nowFn();
|
|
1946
2247
|
const rawRequest = {
|
|
1947
2248
|
request: promptInputs.request,
|
|
1948
2249
|
guidelines: promptInputs.guidelines,
|
|
1949
|
-
guideline_paths:
|
|
2250
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2251
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
1950
2252
|
};
|
|
1951
2253
|
return {
|
|
1952
|
-
eval_id:
|
|
1953
|
-
conversation_id:
|
|
2254
|
+
eval_id: evalCase.id,
|
|
2255
|
+
conversation_id: evalCase.conversation_id,
|
|
1954
2256
|
score: grade.score,
|
|
1955
2257
|
hits: grade.hits,
|
|
1956
2258
|
misses: grade.misses,
|
|
@@ -1964,11 +2266,11 @@ async function runTestCase(options) {
|
|
|
1964
2266
|
grader_raw_request: grade.graderRawRequest
|
|
1965
2267
|
};
|
|
1966
2268
|
}
|
|
1967
|
-
function
|
|
2269
|
+
function filterEvalCases(evalCases, evalId) {
|
|
1968
2270
|
if (!evalId) {
|
|
1969
|
-
return
|
|
2271
|
+
return evalCases;
|
|
1970
2272
|
}
|
|
1971
|
-
return
|
|
2273
|
+
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
1972
2274
|
}
|
|
1973
2275
|
function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
1974
2276
|
const heuristic = overrides?.heuristic ?? new HeuristicGrader();
|
|
@@ -1986,16 +2288,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
1986
2288
|
llm_judge: llmJudge
|
|
1987
2289
|
};
|
|
1988
2290
|
}
|
|
1989
|
-
async function dumpPrompt(directory,
|
|
2291
|
+
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
1990
2292
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1991
|
-
const filename = `${timestamp}_${sanitizeFilename(
|
|
2293
|
+
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
1992
2294
|
const filePath = path4.resolve(directory, filename);
|
|
1993
2295
|
await mkdir(path4.dirname(filePath), { recursive: true });
|
|
1994
2296
|
const payload = {
|
|
1995
|
-
eval_id:
|
|
2297
|
+
eval_id: evalCase.id,
|
|
1996
2298
|
request: promptInputs.request,
|
|
1997
2299
|
guidelines: promptInputs.guidelines,
|
|
1998
|
-
guideline_paths:
|
|
2300
|
+
guideline_paths: evalCase.guideline_paths
|
|
1999
2301
|
};
|
|
2000
2302
|
await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
2001
2303
|
}
|
|
@@ -2007,7 +2309,7 @@ function sanitizeFilename(value) {
|
|
|
2007
2309
|
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
2008
2310
|
}
|
|
2009
2311
|
async function invokeProvider(provider, options) {
|
|
2010
|
-
const {
|
|
2312
|
+
const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2011
2313
|
const controller = new AbortController();
|
|
2012
2314
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2013
2315
|
if (signal) {
|
|
@@ -2017,12 +2319,12 @@ async function invokeProvider(provider, options) {
|
|
|
2017
2319
|
return await provider.invoke({
|
|
2018
2320
|
prompt: promptInputs.request,
|
|
2019
2321
|
guidelines: promptInputs.guidelines,
|
|
2020
|
-
|
|
2021
|
-
|
|
2322
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2323
|
+
attachments: evalCase.file_paths,
|
|
2324
|
+
evalCaseId: evalCase.id,
|
|
2022
2325
|
attempt,
|
|
2023
2326
|
metadata: {
|
|
2024
|
-
|
|
2025
|
-
grader: testCase.grader
|
|
2327
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2026
2328
|
},
|
|
2027
2329
|
signal: controller.signal
|
|
2028
2330
|
});
|
|
@@ -2032,17 +2334,18 @@ async function invokeProvider(provider, options) {
|
|
|
2032
2334
|
}
|
|
2033
2335
|
}
|
|
2034
2336
|
}
|
|
2035
|
-
function buildErrorResult(
|
|
2337
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
2036
2338
|
const message = error instanceof Error ? error.message : String(error);
|
|
2037
2339
|
const rawRequest = {
|
|
2038
2340
|
request: promptInputs.request,
|
|
2039
2341
|
guidelines: promptInputs.guidelines,
|
|
2040
|
-
guideline_paths:
|
|
2342
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2343
|
+
system_message: promptInputs.systemMessage ?? "",
|
|
2041
2344
|
error: message
|
|
2042
2345
|
};
|
|
2043
2346
|
return {
|
|
2044
|
-
eval_id:
|
|
2045
|
-
conversation_id:
|
|
2347
|
+
eval_id: evalCase.id,
|
|
2348
|
+
conversation_id: evalCase.conversation_id,
|
|
2046
2349
|
score: 0,
|
|
2047
2350
|
hits: [],
|
|
2048
2351
|
misses: [`Error: ${message}`],
|
|
@@ -2054,13 +2357,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
|
|
|
2054
2357
|
raw_request: rawRequest
|
|
2055
2358
|
};
|
|
2056
2359
|
}
|
|
2057
|
-
function createCacheKey(provider, target,
|
|
2360
|
+
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
2058
2361
|
const hash = createHash("sha256");
|
|
2059
2362
|
hash.update(provider.id);
|
|
2060
2363
|
hash.update(target.name);
|
|
2061
|
-
hash.update(
|
|
2364
|
+
hash.update(evalCase.id);
|
|
2062
2365
|
hash.update(promptInputs.request);
|
|
2063
2366
|
hash.update(promptInputs.guidelines);
|
|
2367
|
+
hash.update(promptInputs.systemMessage ?? "");
|
|
2064
2368
|
return hash.digest("hex");
|
|
2065
2369
|
}
|
|
2066
2370
|
function isTimeoutLike(error) {
|
|
@@ -2088,7 +2392,9 @@ export {
|
|
|
2088
2392
|
HeuristicGrader,
|
|
2089
2393
|
QualityGrader,
|
|
2090
2394
|
TEST_MESSAGE_ROLES,
|
|
2395
|
+
buildDirectoryChain,
|
|
2091
2396
|
buildPromptInputs,
|
|
2397
|
+
buildSearchRoots,
|
|
2092
2398
|
calculateHits,
|
|
2093
2399
|
calculateMisses,
|
|
2094
2400
|
createAgentKernel,
|
|
@@ -2096,6 +2402,8 @@ export {
|
|
|
2096
2402
|
ensureVSCodeSubagents,
|
|
2097
2403
|
extractAspects,
|
|
2098
2404
|
extractCodeBlocks,
|
|
2405
|
+
fileExists,
|
|
2406
|
+
findGitRoot,
|
|
2099
2407
|
getHitCount,
|
|
2100
2408
|
isErrorLike,
|
|
2101
2409
|
isGraderKind,
|
|
@@ -2105,12 +2413,13 @@ export {
|
|
|
2105
2413
|
isTestMessage,
|
|
2106
2414
|
isTestMessageRole,
|
|
2107
2415
|
listTargetNames,
|
|
2108
|
-
|
|
2416
|
+
loadEvalCases,
|
|
2109
2417
|
readTargetDefinitions,
|
|
2110
2418
|
resolveAndCreateProvider,
|
|
2419
|
+
resolveFileReference,
|
|
2111
2420
|
resolveTargetDefinition,
|
|
2421
|
+
runEvalCase,
|
|
2112
2422
|
runEvaluation,
|
|
2113
|
-
runTestCase,
|
|
2114
2423
|
scoreCandidateResponse
|
|
2115
2424
|
};
|
|
2116
2425
|
//# sourceMappingURL=index.js.map
|