@agentv/core 0.2.6 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-QVS4OL44.js → chunk-XXNQA4EW.js} +27 -1
- package/dist/chunk-XXNQA4EW.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +93 -8
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.d.cts +7 -2
- package/dist/evaluation/validation/index.d.ts +7 -2
- package/dist/evaluation/validation/index.js +91 -7
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +274 -182
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +39 -10
- package/dist/index.d.ts +39 -10
- package/dist/index.js +237 -187
- package/dist/index.js.map +1 -1
- package/package.json +6 -2
- package/dist/chunk-QVS4OL44.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -34,7 +34,9 @@ __export(index_exports, {
|
|
|
34
34
|
HeuristicGrader: () => HeuristicGrader,
|
|
35
35
|
QualityGrader: () => QualityGrader,
|
|
36
36
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
37
|
+
buildDirectoryChain: () => buildDirectoryChain,
|
|
37
38
|
buildPromptInputs: () => buildPromptInputs,
|
|
39
|
+
buildSearchRoots: () => buildSearchRoots,
|
|
38
40
|
calculateHits: () => calculateHits,
|
|
39
41
|
calculateMisses: () => calculateMisses,
|
|
40
42
|
createAgentKernel: () => createAgentKernel,
|
|
@@ -42,6 +44,8 @@ __export(index_exports, {
|
|
|
42
44
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
43
45
|
extractAspects: () => extractAspects,
|
|
44
46
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
47
|
+
fileExists: () => fileExists,
|
|
48
|
+
findGitRoot: () => findGitRoot,
|
|
45
49
|
getHitCount: () => getHitCount,
|
|
46
50
|
isErrorLike: () => isErrorLike,
|
|
47
51
|
isGraderKind: () => isGraderKind,
|
|
@@ -51,12 +55,13 @@ __export(index_exports, {
|
|
|
51
55
|
isTestMessage: () => isTestMessage,
|
|
52
56
|
isTestMessageRole: () => isTestMessageRole,
|
|
53
57
|
listTargetNames: () => listTargetNames,
|
|
54
|
-
|
|
58
|
+
loadEvalCases: () => loadEvalCases,
|
|
55
59
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
56
60
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
61
|
+
resolveFileReference: () => resolveFileReference,
|
|
57
62
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
63
|
+
runEvalCase: () => runEvalCase,
|
|
58
64
|
runEvaluation: () => runEvaluation,
|
|
59
|
-
runTestCase: () => runTestCase,
|
|
60
65
|
scoreCandidateResponse: () => scoreCandidateResponse
|
|
61
66
|
});
|
|
62
67
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -113,6 +118,7 @@ function getHitCount(result) {
|
|
|
113
118
|
}
|
|
114
119
|
|
|
115
120
|
// src/evaluation/yaml-parser.ts
|
|
121
|
+
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
116
122
|
var import_node_fs2 = require("fs");
|
|
117
123
|
var import_promises2 = require("fs/promises");
|
|
118
124
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
@@ -131,6 +137,46 @@ async function fileExists(filePath) {
|
|
|
131
137
|
return false;
|
|
132
138
|
}
|
|
133
139
|
}
|
|
140
|
+
async function findGitRoot(startPath) {
|
|
141
|
+
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
142
|
+
const root = import_node_path.default.parse(currentDir).root;
|
|
143
|
+
while (currentDir !== root) {
|
|
144
|
+
const gitPath = import_node_path.default.join(currentDir, ".git");
|
|
145
|
+
if (await fileExists(gitPath)) {
|
|
146
|
+
return currentDir;
|
|
147
|
+
}
|
|
148
|
+
const parentDir = import_node_path.default.dirname(currentDir);
|
|
149
|
+
if (parentDir === currentDir) {
|
|
150
|
+
break;
|
|
151
|
+
}
|
|
152
|
+
currentDir = parentDir;
|
|
153
|
+
}
|
|
154
|
+
return null;
|
|
155
|
+
}
|
|
156
|
+
function buildDirectoryChain(filePath, repoRoot) {
|
|
157
|
+
const directories = [];
|
|
158
|
+
const seen = /* @__PURE__ */ new Set();
|
|
159
|
+
const boundary = import_node_path.default.resolve(repoRoot);
|
|
160
|
+
let current = import_node_path.default.resolve(import_node_path.default.dirname(filePath));
|
|
161
|
+
while (current !== void 0) {
|
|
162
|
+
if (!seen.has(current)) {
|
|
163
|
+
directories.push(current);
|
|
164
|
+
seen.add(current);
|
|
165
|
+
}
|
|
166
|
+
if (current === boundary) {
|
|
167
|
+
break;
|
|
168
|
+
}
|
|
169
|
+
const parent = import_node_path.default.dirname(current);
|
|
170
|
+
if (parent === current) {
|
|
171
|
+
break;
|
|
172
|
+
}
|
|
173
|
+
current = parent;
|
|
174
|
+
}
|
|
175
|
+
if (!seen.has(boundary)) {
|
|
176
|
+
directories.push(boundary);
|
|
177
|
+
}
|
|
178
|
+
return directories;
|
|
179
|
+
}
|
|
134
180
|
function buildSearchRoots(evalPath, repoRoot) {
|
|
135
181
|
const uniqueRoots = [];
|
|
136
182
|
const addRoot = (root) => {
|
|
@@ -188,9 +234,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
|
188
234
|
var ANSI_YELLOW = "\x1B[33m";
|
|
189
235
|
var ANSI_RESET = "\x1B[0m";
|
|
190
236
|
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
191
|
-
|
|
237
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
238
|
+
async function loadConfig(evalFilePath, repoRoot) {
|
|
239
|
+
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
240
|
+
for (const directory of directories) {
|
|
241
|
+
const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
|
|
242
|
+
if (!await fileExists2(configPath)) {
|
|
243
|
+
continue;
|
|
244
|
+
}
|
|
245
|
+
try {
|
|
246
|
+
const rawConfig = await (0, import_promises2.readFile)(configPath, "utf8");
|
|
247
|
+
const parsed = (0, import_yaml.parse)(rawConfig);
|
|
248
|
+
if (!isJsonObject(parsed)) {
|
|
249
|
+
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
250
|
+
continue;
|
|
251
|
+
}
|
|
252
|
+
const config = parsed;
|
|
253
|
+
const schema = config.$schema;
|
|
254
|
+
if (schema !== SCHEMA_CONFIG_V2) {
|
|
255
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
256
|
+
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
257
|
+
logWarning(message);
|
|
258
|
+
continue;
|
|
259
|
+
}
|
|
260
|
+
const guidelinePatterns = config.guideline_patterns;
|
|
261
|
+
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
262
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
265
|
+
if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
|
|
266
|
+
logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
|
|
267
|
+
continue;
|
|
268
|
+
}
|
|
269
|
+
return {
|
|
270
|
+
guideline_patterns: guidelinePatterns
|
|
271
|
+
};
|
|
272
|
+
} catch (error) {
|
|
273
|
+
logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
|
|
274
|
+
continue;
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
return null;
|
|
278
|
+
}
|
|
279
|
+
function isGuidelineFile(filePath, patterns) {
|
|
192
280
|
const normalized = filePath.split("\\").join("/");
|
|
193
|
-
|
|
281
|
+
const patternsToUse = patterns ?? [];
|
|
282
|
+
return import_micromatch.default.isMatch(normalized, patternsToUse);
|
|
194
283
|
}
|
|
195
284
|
function extractCodeBlocks(segments) {
|
|
196
285
|
const codeBlocks = [];
|
|
@@ -210,43 +299,45 @@ function extractCodeBlocks(segments) {
|
|
|
210
299
|
}
|
|
211
300
|
return codeBlocks;
|
|
212
301
|
}
|
|
213
|
-
async function
|
|
302
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
214
303
|
const verbose = options?.verbose ?? false;
|
|
215
|
-
const absoluteTestPath = import_node_path2.default.resolve(
|
|
304
|
+
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
216
305
|
if (!await fileExists2(absoluteTestPath)) {
|
|
217
|
-
throw new Error(`Test file not found: ${
|
|
306
|
+
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
218
307
|
}
|
|
219
308
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
220
309
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
310
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
311
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
221
312
|
const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
|
|
222
313
|
const parsed = (0, import_yaml.parse)(rawFile);
|
|
223
314
|
if (!isJsonObject(parsed)) {
|
|
224
|
-
throw new Error(`Invalid test file format: ${
|
|
315
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
225
316
|
}
|
|
226
317
|
const suite = parsed;
|
|
227
318
|
const schema = suite.$schema;
|
|
228
319
|
if (schema !== SCHEMA_EVAL_V2) {
|
|
229
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${
|
|
320
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
230
321
|
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
231
322
|
throw new Error(message);
|
|
232
323
|
}
|
|
233
324
|
const rawTestcases = suite.evalcases;
|
|
234
325
|
if (!Array.isArray(rawTestcases)) {
|
|
235
|
-
throw new Error(`Invalid test file format: ${
|
|
326
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
236
327
|
}
|
|
237
328
|
const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
|
|
238
329
|
const results = [];
|
|
239
|
-
for (const
|
|
240
|
-
if (!isJsonObject(
|
|
330
|
+
for (const rawEvalcase of rawTestcases) {
|
|
331
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
241
332
|
logWarning("Skipping invalid test case entry (expected object)");
|
|
242
333
|
continue;
|
|
243
334
|
}
|
|
244
|
-
const
|
|
245
|
-
const id = asString(
|
|
246
|
-
const conversationId = asString(
|
|
247
|
-
const outcome = asString(
|
|
248
|
-
const inputMessagesValue =
|
|
249
|
-
const expectedMessagesValue =
|
|
335
|
+
const evalcase = rawEvalcase;
|
|
336
|
+
const id = asString(evalcase.id);
|
|
337
|
+
const conversationId = asString(evalcase.conversation_id);
|
|
338
|
+
const outcome = asString(evalcase.outcome);
|
|
339
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
340
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
250
341
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
251
342
|
logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
|
|
252
343
|
continue;
|
|
@@ -259,6 +350,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
259
350
|
const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
|
|
260
351
|
const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
|
|
261
352
|
const userMessages = inputMessages.filter((message) => message.role === "user");
|
|
353
|
+
const systemMessages = inputMessages.filter((message) => message.role === "system");
|
|
262
354
|
if (assistantMessages.length === 0) {
|
|
263
355
|
logWarning(`No assistant message found for test case: ${id}`);
|
|
264
356
|
continue;
|
|
@@ -266,6 +358,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
266
358
|
if (assistantMessages.length > 1) {
|
|
267
359
|
logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
|
|
268
360
|
}
|
|
361
|
+
if (systemMessages.length > 1) {
|
|
362
|
+
logWarning(`Multiple system messages found for test case: ${id}, using first`);
|
|
363
|
+
}
|
|
364
|
+
let systemMessageContent;
|
|
365
|
+
if (systemMessages.length > 0) {
|
|
366
|
+
const content = systemMessages[0]?.content;
|
|
367
|
+
if (typeof content === "string") {
|
|
368
|
+
systemMessageContent = content;
|
|
369
|
+
} else if (Array.isArray(content)) {
|
|
370
|
+
const textParts = [];
|
|
371
|
+
for (const segment of content) {
|
|
372
|
+
if (isJsonObject(segment)) {
|
|
373
|
+
const value = segment.value;
|
|
374
|
+
if (typeof value === "string") {
|
|
375
|
+
textParts.push(value);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
if (textParts.length > 0) {
|
|
380
|
+
systemMessageContent = textParts.join("\n\n");
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
}
|
|
269
384
|
const userSegments = [];
|
|
270
385
|
const guidelinePaths = [];
|
|
271
386
|
const userTextParts = [];
|
|
@@ -297,7 +412,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
297
412
|
}
|
|
298
413
|
try {
|
|
299
414
|
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
300
|
-
|
|
415
|
+
const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
|
|
416
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
301
417
|
guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
|
|
302
418
|
if (verbose) {
|
|
303
419
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
@@ -307,7 +423,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
307
423
|
userSegments.push({
|
|
308
424
|
type: "file",
|
|
309
425
|
path: displayPath,
|
|
310
|
-
text: fileContent
|
|
426
|
+
text: fileContent,
|
|
427
|
+
resolvedPath: import_node_path2.default.resolve(resolvedPath)
|
|
311
428
|
});
|
|
312
429
|
if (verbose) {
|
|
313
430
|
console.log(` [File] Found: ${displayPath}`);
|
|
@@ -331,14 +448,27 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
|
331
448
|
const assistantContent = assistantMessages[0]?.content;
|
|
332
449
|
const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
|
|
333
450
|
const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
334
|
-
const testCaseGrader = coerceGrader(
|
|
451
|
+
const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
|
|
452
|
+
const userFilePaths = [];
|
|
453
|
+
for (const segment of userSegments) {
|
|
454
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
455
|
+
userFilePaths.push(segment.resolvedPath);
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
const allFilePaths = [
|
|
459
|
+
...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
460
|
+
...userFilePaths
|
|
461
|
+
];
|
|
335
462
|
const testCase = {
|
|
336
463
|
id,
|
|
337
464
|
conversation_id: conversationId,
|
|
338
465
|
task: userTextPrompt,
|
|
339
466
|
user_segments: userSegments,
|
|
467
|
+
system_message: systemMessageContent,
|
|
340
468
|
expected_assistant_raw: expectedAssistantRaw,
|
|
341
469
|
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
470
|
+
guideline_patterns: guidelinePatterns,
|
|
471
|
+
file_paths: allFilePaths,
|
|
342
472
|
code_snippets: codeSnippets,
|
|
343
473
|
outcome,
|
|
344
474
|
grader: testCaseGrader
|
|
@@ -404,7 +534,7 @@ ${body}`);
|
|
|
404
534
|
}
|
|
405
535
|
const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
406
536
|
const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
|
|
407
|
-
return { request, guidelines };
|
|
537
|
+
return { request, guidelines, systemMessage: testCase.system_message };
|
|
408
538
|
}
|
|
409
539
|
async function fileExists2(absolutePath) {
|
|
410
540
|
try {
|
|
@@ -530,15 +660,18 @@ function buildChatPrompt(request) {
|
|
|
530
660
|
return request.chatPrompt;
|
|
531
661
|
}
|
|
532
662
|
const systemSegments = [];
|
|
533
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
534
|
-
systemSegments.push(`Guidelines:
|
|
535
|
-
${request.guidelines.trim()}`);
|
|
536
|
-
}
|
|
537
663
|
const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
|
|
538
664
|
if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
|
|
539
665
|
systemSegments.push(metadataSystemPrompt.trim());
|
|
666
|
+
} else {
|
|
667
|
+
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
540
668
|
}
|
|
541
|
-
|
|
669
|
+
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
670
|
+
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
671
|
+
|
|
672
|
+
${request.guidelines.trim()}`);
|
|
673
|
+
}
|
|
674
|
+
const systemContent = systemSegments.join("\n\n");
|
|
542
675
|
const userContent = request.prompt.trim();
|
|
543
676
|
const prompt = [
|
|
544
677
|
{
|
|
@@ -995,10 +1128,8 @@ function isLikelyEnvReference(value) {
|
|
|
995
1128
|
|
|
996
1129
|
// src/evaluation/providers/vscode.ts
|
|
997
1130
|
var import_promises3 = require("fs/promises");
|
|
998
|
-
var import_node_os = require("os");
|
|
999
1131
|
var import_node_path3 = __toESM(require("path"), 1);
|
|
1000
1132
|
var import_subagent = require("subagent");
|
|
1001
|
-
var PROMPT_FILE_PREFIX = "agentv-vscode-";
|
|
1002
1133
|
var VSCodeProvider = class {
|
|
1003
1134
|
id;
|
|
1004
1135
|
kind;
|
|
@@ -1015,128 +1146,89 @@ var VSCodeProvider = class {
|
|
|
1015
1146
|
throw new Error("VS Code provider request was aborted before dispatch");
|
|
1016
1147
|
}
|
|
1017
1148
|
const attachments = normalizeAttachments(request.attachments);
|
|
1018
|
-
const promptContent = buildPromptDocument(request, attachments);
|
|
1019
|
-
const
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
1036
|
-
throw new Error(failure);
|
|
1037
|
-
}
|
|
1038
|
-
if (this.config.dryRun) {
|
|
1039
|
-
return {
|
|
1040
|
-
text: "",
|
|
1041
|
-
raw: {
|
|
1042
|
-
session,
|
|
1043
|
-
promptFile: promptPath,
|
|
1044
|
-
attachments
|
|
1045
|
-
}
|
|
1046
|
-
};
|
|
1047
|
-
}
|
|
1048
|
-
const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
|
|
1149
|
+
const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
|
|
1150
|
+
const session = await (0, import_subagent.dispatchAgentSession)({
|
|
1151
|
+
userQuery: promptContent,
|
|
1152
|
+
// Use full prompt content instead of just request.prompt
|
|
1153
|
+
extraAttachments: attachments,
|
|
1154
|
+
wait: this.config.waitForResponse,
|
|
1155
|
+
dryRun: this.config.dryRun,
|
|
1156
|
+
vscodeCmd: this.config.command,
|
|
1157
|
+
subagentRoot: this.config.subagentRoot,
|
|
1158
|
+
workspaceTemplate: this.config.workspaceTemplate,
|
|
1159
|
+
silent: true
|
|
1160
|
+
});
|
|
1161
|
+
if (session.exitCode !== 0 || !session.responseFile) {
|
|
1162
|
+
const failure = session.error ?? "VS Code subagent did not produce a response";
|
|
1163
|
+
throw new Error(failure);
|
|
1164
|
+
}
|
|
1165
|
+
if (this.config.dryRun) {
|
|
1049
1166
|
return {
|
|
1050
|
-
text:
|
|
1167
|
+
text: "",
|
|
1051
1168
|
raw: {
|
|
1052
1169
|
session,
|
|
1053
|
-
promptFile: promptPath,
|
|
1054
1170
|
attachments
|
|
1055
1171
|
}
|
|
1056
1172
|
};
|
|
1057
|
-
} finally {
|
|
1058
|
-
await (0, import_promises3.rm)(directory, { recursive: true, force: true });
|
|
1059
1173
|
}
|
|
1174
|
+
const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
|
|
1175
|
+
return {
|
|
1176
|
+
text: responseText,
|
|
1177
|
+
raw: {
|
|
1178
|
+
session,
|
|
1179
|
+
attachments
|
|
1180
|
+
}
|
|
1181
|
+
};
|
|
1060
1182
|
}
|
|
1061
1183
|
};
|
|
1062
|
-
function buildPromptDocument(request, attachments) {
|
|
1184
|
+
function buildPromptDocument(request, attachments, guidelinePatterns) {
|
|
1063
1185
|
const parts = [];
|
|
1064
|
-
const
|
|
1065
|
-
if (
|
|
1066
|
-
parts.push(buildMandatoryPrereadBlock(
|
|
1067
|
-
}
|
|
1068
|
-
parts.push(`# AgentV Request`);
|
|
1069
|
-
if (request.testCaseId) {
|
|
1070
|
-
parts.push(`- Test Case: ${request.testCaseId}`);
|
|
1071
|
-
}
|
|
1072
|
-
if (request.metadata?.target) {
|
|
1073
|
-
parts.push(`- Target: ${String(request.metadata.target)}`);
|
|
1074
|
-
}
|
|
1075
|
-
parts.push("\n## Task\n", request.prompt.trim());
|
|
1076
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1077
|
-
parts.push("\n## Guidelines\n", request.guidelines.trim());
|
|
1078
|
-
}
|
|
1079
|
-
if (attachments && attachments.length > 0) {
|
|
1080
|
-
const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
|
|
1081
|
-
parts.push("\n## Attachments\n", attachmentList);
|
|
1186
|
+
const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
|
|
1187
|
+
if (guidelineFiles.length > 0) {
|
|
1188
|
+
parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
|
|
1082
1189
|
}
|
|
1190
|
+
parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
|
|
1083
1191
|
return parts.join("\n").trim();
|
|
1084
1192
|
}
|
|
1085
|
-
function buildMandatoryPrereadBlock(
|
|
1086
|
-
if (
|
|
1193
|
+
function buildMandatoryPrereadBlock(guidelineFiles) {
|
|
1194
|
+
if (guidelineFiles.length === 0) {
|
|
1087
1195
|
return "";
|
|
1088
1196
|
}
|
|
1089
1197
|
const fileList = [];
|
|
1090
|
-
const tokenList = [];
|
|
1091
1198
|
let counter = 0;
|
|
1092
|
-
for (const absolutePath of
|
|
1199
|
+
for (const absolutePath of guidelineFiles) {
|
|
1093
1200
|
counter += 1;
|
|
1094
1201
|
const fileName = import_node_path3.default.basename(absolutePath);
|
|
1095
1202
|
const fileUri = pathToFileUri(absolutePath);
|
|
1096
|
-
fileList.push(
|
|
1097
|
-
tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
|
|
1203
|
+
fileList.push(`* [${fileName}](${fileUri})`);
|
|
1098
1204
|
}
|
|
1099
|
-
const filesText = fileList.join("
|
|
1100
|
-
const tokensText = tokenList.join("\n");
|
|
1205
|
+
const filesText = fileList.join("\n");
|
|
1101
1206
|
const instruction = [
|
|
1102
|
-
`Read all
|
|
1103
|
-
|
|
1104
|
-
"`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
|
|
1105
|
-
`Then include, at the top of your reply, these exact tokens on separate lines:
|
|
1207
|
+
`Read all guideline files:
|
|
1208
|
+
${filesText}.
|
|
1106
1209
|
`,
|
|
1107
|
-
tokensText,
|
|
1108
|
-
`
|
|
1109
|
-
Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
|
|
1110
1210
|
`If any file is missing, fail with ERROR: missing-file <filename> and stop.
|
|
1111
1211
|
`,
|
|
1112
|
-
`Then
|
|
1113
|
-
].join("
|
|
1114
|
-
return
|
|
1115
|
-
|
|
1116
|
-
${instruction}
|
|
1117
|
-
|
|
1118
|
-
`;
|
|
1212
|
+
`Then apply system_instructions on the user query below.`
|
|
1213
|
+
].join("");
|
|
1214
|
+
return `${instruction}`;
|
|
1119
1215
|
}
|
|
1120
|
-
function
|
|
1216
|
+
function collectGuidelineFiles(attachments, guidelinePatterns) {
|
|
1121
1217
|
if (!attachments || attachments.length === 0) {
|
|
1122
1218
|
return [];
|
|
1123
1219
|
}
|
|
1124
1220
|
const unique = /* @__PURE__ */ new Map();
|
|
1125
1221
|
for (const attachment of attachments) {
|
|
1126
|
-
if (!isInstructionPath(attachment)) {
|
|
1127
|
-
continue;
|
|
1128
|
-
}
|
|
1129
1222
|
const absolutePath = import_node_path3.default.resolve(attachment);
|
|
1130
|
-
|
|
1131
|
-
|
|
1223
|
+
const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
|
|
1224
|
+
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1225
|
+
if (!unique.has(absolutePath)) {
|
|
1226
|
+
unique.set(absolutePath, absolutePath);
|
|
1227
|
+
}
|
|
1132
1228
|
}
|
|
1133
1229
|
}
|
|
1134
1230
|
return Array.from(unique.values());
|
|
1135
1231
|
}
|
|
1136
|
-
function isInstructionPath(filePath) {
|
|
1137
|
-
const normalized = filePath.split(import_node_path3.default.sep).join("/");
|
|
1138
|
-
return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
|
|
1139
|
-
}
|
|
1140
1232
|
function pathToFileUri(filePath) {
|
|
1141
1233
|
const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
|
|
1142
1234
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
@@ -1145,14 +1237,6 @@ function pathToFileUri(filePath) {
|
|
|
1145
1237
|
}
|
|
1146
1238
|
return `file://${normalizedPath}`;
|
|
1147
1239
|
}
|
|
1148
|
-
function composeUserQuery(request) {
|
|
1149
|
-
const segments = [];
|
|
1150
|
-
segments.push(request.prompt.trim());
|
|
1151
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1152
|
-
segments.push("\nGuidelines:\n", request.guidelines.trim());
|
|
1153
|
-
}
|
|
1154
|
-
return segments.join("\n").trim();
|
|
1155
|
-
}
|
|
1156
1240
|
function normalizeAttachments(attachments) {
|
|
1157
1241
|
if (!attachments || attachments.length === 0) {
|
|
1158
1242
|
return void 0;
|
|
@@ -1504,7 +1588,7 @@ var import_node_crypto = require("crypto");
|
|
|
1504
1588
|
var HeuristicGrader = class {
|
|
1505
1589
|
kind = "heuristic";
|
|
1506
1590
|
grade(context) {
|
|
1507
|
-
const expectedAspects = extractAspects(context.
|
|
1591
|
+
const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
|
|
1508
1592
|
const result = scoreCandidateResponse(context.candidate, expectedAspects);
|
|
1509
1593
|
const misses = [...result.misses];
|
|
1510
1594
|
if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
|
|
@@ -1537,14 +1621,14 @@ var QualityGrader = class {
|
|
|
1537
1621
|
if (!judgeProvider) {
|
|
1538
1622
|
throw new Error("No judge provider available for LLM grading");
|
|
1539
1623
|
}
|
|
1540
|
-
const prompt = buildQualityPrompt(context.
|
|
1624
|
+
const prompt = buildQualityPrompt(context.evalCase, context.candidate);
|
|
1541
1625
|
const metadata = {
|
|
1542
1626
|
systemPrompt: QUALITY_SYSTEM_PROMPT
|
|
1543
1627
|
};
|
|
1544
1628
|
const response = await judgeProvider.invoke({
|
|
1545
1629
|
prompt,
|
|
1546
1630
|
metadata,
|
|
1547
|
-
|
|
1631
|
+
evalCaseId: context.evalCase.id,
|
|
1548
1632
|
attempt: context.attempt,
|
|
1549
1633
|
maxOutputTokens: this.maxOutputTokens,
|
|
1550
1634
|
temperature: this.temperature
|
|
@@ -1590,16 +1674,16 @@ var QUALITY_SYSTEM_PROMPT = [
|
|
|
1590
1674
|
function buildQualityPrompt(testCase, candidate) {
|
|
1591
1675
|
const parts = [
|
|
1592
1676
|
"[[ ## expected_outcome ## ]]",
|
|
1593
|
-
testCase.outcome,
|
|
1677
|
+
testCase.outcome.trim(),
|
|
1594
1678
|
"",
|
|
1595
1679
|
"[[ ## request ## ]]",
|
|
1596
|
-
testCase.task,
|
|
1680
|
+
testCase.task.trim(),
|
|
1597
1681
|
"",
|
|
1598
1682
|
"[[ ## reference_answer ## ]]",
|
|
1599
|
-
testCase.expected_assistant_raw,
|
|
1683
|
+
testCase.expected_assistant_raw.trim(),
|
|
1600
1684
|
"",
|
|
1601
1685
|
"[[ ## generated_answer ## ]]",
|
|
1602
|
-
candidate,
|
|
1686
|
+
candidate.trim(),
|
|
1603
1687
|
"",
|
|
1604
1688
|
"Respond with a single JSON object matching the schema described in the system prompt."
|
|
1605
1689
|
];
|
|
@@ -1848,10 +1932,10 @@ async function runEvaluation(options) {
|
|
|
1848
1932
|
onResult,
|
|
1849
1933
|
onProgress
|
|
1850
1934
|
} = options;
|
|
1851
|
-
const load =
|
|
1852
|
-
const
|
|
1853
|
-
const
|
|
1854
|
-
if (
|
|
1935
|
+
const load = loadEvalCases;
|
|
1936
|
+
const evalCases = await load(testFilePath, repoRoot, { verbose });
|
|
1937
|
+
const filteredEvalCases = filterEvalCases(evalCases, evalId);
|
|
1938
|
+
if (filteredEvalCases.length === 0) {
|
|
1855
1939
|
if (evalId) {
|
|
1856
1940
|
throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
|
|
1857
1941
|
}
|
|
@@ -1897,11 +1981,11 @@ async function runEvaluation(options) {
|
|
|
1897
1981
|
};
|
|
1898
1982
|
const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
|
|
1899
1983
|
const primaryProvider = getOrCreateProvider(target);
|
|
1900
|
-
if (onProgress &&
|
|
1901
|
-
for (let i = 0; i <
|
|
1984
|
+
if (onProgress && filteredEvalCases.length > 0) {
|
|
1985
|
+
for (let i = 0; i < filteredEvalCases.length; i++) {
|
|
1902
1986
|
await onProgress({
|
|
1903
1987
|
workerId: i + 1,
|
|
1904
|
-
evalId:
|
|
1988
|
+
evalId: filteredEvalCases[i].id,
|
|
1905
1989
|
status: "pending"
|
|
1906
1990
|
});
|
|
1907
1991
|
}
|
|
@@ -1910,22 +1994,22 @@ async function runEvaluation(options) {
|
|
|
1910
1994
|
const limit = pLimit(workers);
|
|
1911
1995
|
let nextWorkerId = 1;
|
|
1912
1996
|
const workerIdByEvalId = /* @__PURE__ */ new Map();
|
|
1913
|
-
const promises =
|
|
1914
|
-
(
|
|
1997
|
+
const promises = filteredEvalCases.map(
|
|
1998
|
+
(evalCase) => limit(async () => {
|
|
1915
1999
|
const workerId = nextWorkerId++;
|
|
1916
|
-
workerIdByEvalId.set(
|
|
2000
|
+
workerIdByEvalId.set(evalCase.id, workerId);
|
|
1917
2001
|
if (onProgress) {
|
|
1918
2002
|
await onProgress({
|
|
1919
2003
|
workerId,
|
|
1920
|
-
evalId:
|
|
2004
|
+
evalId: evalCase.id,
|
|
1921
2005
|
status: "running",
|
|
1922
2006
|
startedAt: Date.now()
|
|
1923
2007
|
});
|
|
1924
2008
|
}
|
|
1925
2009
|
try {
|
|
1926
2010
|
const judgeProvider = await resolveJudgeProvider(target);
|
|
1927
|
-
const result = await
|
|
1928
|
-
|
|
2011
|
+
const result = await runEvalCase({
|
|
2012
|
+
evalCase,
|
|
1929
2013
|
provider: primaryProvider,
|
|
1930
2014
|
target,
|
|
1931
2015
|
graders: graderRegistry,
|
|
@@ -1940,7 +2024,7 @@ async function runEvaluation(options) {
|
|
|
1940
2024
|
if (onProgress) {
|
|
1941
2025
|
await onProgress({
|
|
1942
2026
|
workerId,
|
|
1943
|
-
evalId:
|
|
2027
|
+
evalId: evalCase.id,
|
|
1944
2028
|
status: "completed",
|
|
1945
2029
|
startedAt: 0,
|
|
1946
2030
|
// Not used for completed status
|
|
@@ -1955,7 +2039,7 @@ async function runEvaluation(options) {
|
|
|
1955
2039
|
if (onProgress) {
|
|
1956
2040
|
await onProgress({
|
|
1957
2041
|
workerId,
|
|
1958
|
-
evalId:
|
|
2042
|
+
evalId: evalCase.id,
|
|
1959
2043
|
status: "failed",
|
|
1960
2044
|
completedAt: Date.now(),
|
|
1961
2045
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -1972,10 +2056,10 @@ async function runEvaluation(options) {
|
|
|
1972
2056
|
if (outcome.status === "fulfilled") {
|
|
1973
2057
|
results.push(outcome.value);
|
|
1974
2058
|
} else {
|
|
1975
|
-
const
|
|
1976
|
-
const promptInputs = await buildPromptInputs(
|
|
2059
|
+
const evalCase = filteredEvalCases[i];
|
|
2060
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
1977
2061
|
const errorResult = buildErrorResult(
|
|
1978
|
-
|
|
2062
|
+
evalCase,
|
|
1979
2063
|
target.name,
|
|
1980
2064
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
1981
2065
|
outcome.reason,
|
|
@@ -1989,9 +2073,9 @@ async function runEvaluation(options) {
|
|
|
1989
2073
|
}
|
|
1990
2074
|
return results;
|
|
1991
2075
|
}
|
|
1992
|
-
async function
|
|
2076
|
+
async function runEvalCase(options) {
|
|
1993
2077
|
const {
|
|
1994
|
-
|
|
2078
|
+
evalCase,
|
|
1995
2079
|
provider,
|
|
1996
2080
|
target,
|
|
1997
2081
|
graders,
|
|
@@ -2004,11 +2088,11 @@ async function runTestCase(options) {
|
|
|
2004
2088
|
signal,
|
|
2005
2089
|
judgeProvider
|
|
2006
2090
|
} = options;
|
|
2007
|
-
const promptInputs = await buildPromptInputs(
|
|
2091
|
+
const promptInputs = await buildPromptInputs(evalCase);
|
|
2008
2092
|
if (promptDumpDir) {
|
|
2009
|
-
await dumpPrompt(promptDumpDir,
|
|
2093
|
+
await dumpPrompt(promptDumpDir, evalCase, promptInputs);
|
|
2010
2094
|
}
|
|
2011
|
-
const cacheKey = useCache ? createCacheKey(provider, target,
|
|
2095
|
+
const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
|
|
2012
2096
|
let cachedResponse;
|
|
2013
2097
|
if (cacheKey && cache) {
|
|
2014
2098
|
cachedResponse = await cache.get(cacheKey);
|
|
@@ -2021,7 +2105,7 @@ async function runTestCase(options) {
|
|
|
2021
2105
|
while (!providerResponse && attempt < attemptBudget) {
|
|
2022
2106
|
try {
|
|
2023
2107
|
providerResponse = await invokeProvider(provider, {
|
|
2024
|
-
|
|
2108
|
+
evalCase,
|
|
2025
2109
|
target,
|
|
2026
2110
|
promptInputs,
|
|
2027
2111
|
attempt,
|
|
@@ -2034,12 +2118,12 @@ async function runTestCase(options) {
|
|
|
2034
2118
|
attempt += 1;
|
|
2035
2119
|
continue;
|
|
2036
2120
|
}
|
|
2037
|
-
return buildErrorResult(
|
|
2121
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2038
2122
|
}
|
|
2039
2123
|
}
|
|
2040
2124
|
if (!providerResponse) {
|
|
2041
2125
|
return buildErrorResult(
|
|
2042
|
-
|
|
2126
|
+
evalCase,
|
|
2043
2127
|
target.name,
|
|
2044
2128
|
nowFn(),
|
|
2045
2129
|
lastError ?? new Error("Provider did not return a response"),
|
|
@@ -2049,7 +2133,7 @@ async function runTestCase(options) {
|
|
|
2049
2133
|
if (cacheKey && cache && !cachedResponse) {
|
|
2050
2134
|
await cache.set(cacheKey, providerResponse);
|
|
2051
2135
|
}
|
|
2052
|
-
const graderKind =
|
|
2136
|
+
const graderKind = evalCase.grader ?? "heuristic";
|
|
2053
2137
|
const activeGrader = graders[graderKind] ?? graders.heuristic;
|
|
2054
2138
|
if (!activeGrader) {
|
|
2055
2139
|
throw new Error(`No grader registered for kind '${graderKind}'`);
|
|
@@ -2058,7 +2142,7 @@ async function runTestCase(options) {
|
|
|
2058
2142
|
try {
|
|
2059
2143
|
const gradeTimestamp = nowFn();
|
|
2060
2144
|
grade = await activeGrader.grade({
|
|
2061
|
-
|
|
2145
|
+
evalCase,
|
|
2062
2146
|
candidate: providerResponse.text ?? "",
|
|
2063
2147
|
target,
|
|
2064
2148
|
provider,
|
|
@@ -2068,17 +2152,18 @@ async function runTestCase(options) {
|
|
|
2068
2152
|
judgeProvider
|
|
2069
2153
|
});
|
|
2070
2154
|
} catch (error) {
|
|
2071
|
-
return buildErrorResult(
|
|
2155
|
+
return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
|
|
2072
2156
|
}
|
|
2073
2157
|
const completedAt = nowFn();
|
|
2074
2158
|
const rawRequest = {
|
|
2075
2159
|
request: promptInputs.request,
|
|
2076
2160
|
guidelines: promptInputs.guidelines,
|
|
2077
|
-
guideline_paths:
|
|
2161
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2162
|
+
system_message: promptInputs.systemMessage ?? ""
|
|
2078
2163
|
};
|
|
2079
2164
|
return {
|
|
2080
|
-
eval_id:
|
|
2081
|
-
conversation_id:
|
|
2165
|
+
eval_id: evalCase.id,
|
|
2166
|
+
conversation_id: evalCase.conversation_id,
|
|
2082
2167
|
score: grade.score,
|
|
2083
2168
|
hits: grade.hits,
|
|
2084
2169
|
misses: grade.misses,
|
|
@@ -2092,11 +2177,11 @@ async function runTestCase(options) {
|
|
|
2092
2177
|
grader_raw_request: grade.graderRawRequest
|
|
2093
2178
|
};
|
|
2094
2179
|
}
|
|
2095
|
-
function
|
|
2180
|
+
function filterEvalCases(evalCases, evalId) {
|
|
2096
2181
|
if (!evalId) {
|
|
2097
|
-
return
|
|
2182
|
+
return evalCases;
|
|
2098
2183
|
}
|
|
2099
|
-
return
|
|
2184
|
+
return evalCases.filter((evalCase) => evalCase.id === evalId);
|
|
2100
2185
|
}
|
|
2101
2186
|
function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
2102
2187
|
const heuristic = overrides?.heuristic ?? new HeuristicGrader();
|
|
@@ -2114,16 +2199,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
|
|
|
2114
2199
|
llm_judge: llmJudge
|
|
2115
2200
|
};
|
|
2116
2201
|
}
|
|
2117
|
-
async function dumpPrompt(directory,
|
|
2202
|
+
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
2118
2203
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
2119
|
-
const filename = `${timestamp}_${sanitizeFilename(
|
|
2204
|
+
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
2120
2205
|
const filePath = import_node_path5.default.resolve(directory, filename);
|
|
2121
2206
|
await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
|
|
2122
2207
|
const payload = {
|
|
2123
|
-
eval_id:
|
|
2208
|
+
eval_id: evalCase.id,
|
|
2124
2209
|
request: promptInputs.request,
|
|
2125
2210
|
guidelines: promptInputs.guidelines,
|
|
2126
|
-
guideline_paths:
|
|
2211
|
+
guideline_paths: evalCase.guideline_paths
|
|
2127
2212
|
};
|
|
2128
2213
|
await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
2129
2214
|
}
|
|
@@ -2135,7 +2220,7 @@ function sanitizeFilename(value) {
|
|
|
2135
2220
|
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
2136
2221
|
}
|
|
2137
2222
|
async function invokeProvider(provider, options) {
|
|
2138
|
-
const {
|
|
2223
|
+
const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
2139
2224
|
const controller = new AbortController();
|
|
2140
2225
|
const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
|
|
2141
2226
|
if (signal) {
|
|
@@ -2145,12 +2230,12 @@ async function invokeProvider(provider, options) {
|
|
|
2145
2230
|
return await provider.invoke({
|
|
2146
2231
|
prompt: promptInputs.request,
|
|
2147
2232
|
guidelines: promptInputs.guidelines,
|
|
2148
|
-
|
|
2149
|
-
|
|
2233
|
+
guideline_patterns: evalCase.guideline_patterns,
|
|
2234
|
+
attachments: evalCase.file_paths,
|
|
2235
|
+
evalCaseId: evalCase.id,
|
|
2150
2236
|
attempt,
|
|
2151
2237
|
metadata: {
|
|
2152
|
-
|
|
2153
|
-
grader: testCase.grader
|
|
2238
|
+
systemPrompt: promptInputs.systemMessage ?? ""
|
|
2154
2239
|
},
|
|
2155
2240
|
signal: controller.signal
|
|
2156
2241
|
});
|
|
@@ -2160,17 +2245,18 @@ async function invokeProvider(provider, options) {
|
|
|
2160
2245
|
}
|
|
2161
2246
|
}
|
|
2162
2247
|
}
|
|
2163
|
-
function buildErrorResult(
|
|
2248
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
|
|
2164
2249
|
const message = error instanceof Error ? error.message : String(error);
|
|
2165
2250
|
const rawRequest = {
|
|
2166
2251
|
request: promptInputs.request,
|
|
2167
2252
|
guidelines: promptInputs.guidelines,
|
|
2168
|
-
guideline_paths:
|
|
2253
|
+
guideline_paths: evalCase.guideline_paths,
|
|
2254
|
+
system_message: promptInputs.systemMessage ?? "",
|
|
2169
2255
|
error: message
|
|
2170
2256
|
};
|
|
2171
2257
|
return {
|
|
2172
|
-
eval_id:
|
|
2173
|
-
conversation_id:
|
|
2258
|
+
eval_id: evalCase.id,
|
|
2259
|
+
conversation_id: evalCase.conversation_id,
|
|
2174
2260
|
score: 0,
|
|
2175
2261
|
hits: [],
|
|
2176
2262
|
misses: [`Error: ${message}`],
|
|
@@ -2182,13 +2268,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
|
|
|
2182
2268
|
raw_request: rawRequest
|
|
2183
2269
|
};
|
|
2184
2270
|
}
|
|
2185
|
-
function createCacheKey(provider, target,
|
|
2271
|
+
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
2186
2272
|
const hash = (0, import_node_crypto2.createHash)("sha256");
|
|
2187
2273
|
hash.update(provider.id);
|
|
2188
2274
|
hash.update(target.name);
|
|
2189
|
-
hash.update(
|
|
2275
|
+
hash.update(evalCase.id);
|
|
2190
2276
|
hash.update(promptInputs.request);
|
|
2191
2277
|
hash.update(promptInputs.guidelines);
|
|
2278
|
+
hash.update(promptInputs.systemMessage ?? "");
|
|
2192
2279
|
return hash.digest("hex");
|
|
2193
2280
|
}
|
|
2194
2281
|
function isTimeoutLike(error) {
|
|
@@ -2217,7 +2304,9 @@ function createAgentKernel() {
|
|
|
2217
2304
|
HeuristicGrader,
|
|
2218
2305
|
QualityGrader,
|
|
2219
2306
|
TEST_MESSAGE_ROLES,
|
|
2307
|
+
buildDirectoryChain,
|
|
2220
2308
|
buildPromptInputs,
|
|
2309
|
+
buildSearchRoots,
|
|
2221
2310
|
calculateHits,
|
|
2222
2311
|
calculateMisses,
|
|
2223
2312
|
createAgentKernel,
|
|
@@ -2225,6 +2314,8 @@ function createAgentKernel() {
|
|
|
2225
2314
|
ensureVSCodeSubagents,
|
|
2226
2315
|
extractAspects,
|
|
2227
2316
|
extractCodeBlocks,
|
|
2317
|
+
fileExists,
|
|
2318
|
+
findGitRoot,
|
|
2228
2319
|
getHitCount,
|
|
2229
2320
|
isErrorLike,
|
|
2230
2321
|
isGraderKind,
|
|
@@ -2234,12 +2325,13 @@ function createAgentKernel() {
|
|
|
2234
2325
|
isTestMessage,
|
|
2235
2326
|
isTestMessageRole,
|
|
2236
2327
|
listTargetNames,
|
|
2237
|
-
|
|
2328
|
+
loadEvalCases,
|
|
2238
2329
|
readTargetDefinitions,
|
|
2239
2330
|
resolveAndCreateProvider,
|
|
2331
|
+
resolveFileReference,
|
|
2240
2332
|
resolveTargetDefinition,
|
|
2333
|
+
runEvalCase,
|
|
2241
2334
|
runEvaluation,
|
|
2242
|
-
runTestCase,
|
|
2243
2335
|
scoreCandidateResponse
|
|
2244
2336
|
});
|
|
2245
2337
|
//# sourceMappingURL=index.cjs.map
|