@agentv/core 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -34,7 +34,9 @@ __export(index_exports, {
34
34
  HeuristicGrader: () => HeuristicGrader,
35
35
  QualityGrader: () => QualityGrader,
36
36
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
+ buildDirectoryChain: () => buildDirectoryChain,
37
38
  buildPromptInputs: () => buildPromptInputs,
39
+ buildSearchRoots: () => buildSearchRoots,
38
40
  calculateHits: () => calculateHits,
39
41
  calculateMisses: () => calculateMisses,
40
42
  createAgentKernel: () => createAgentKernel,
@@ -42,6 +44,8 @@ __export(index_exports, {
42
44
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
43
45
  extractAspects: () => extractAspects,
44
46
  extractCodeBlocks: () => extractCodeBlocks,
47
+ fileExists: () => fileExists,
48
+ findGitRoot: () => findGitRoot,
45
49
  getHitCount: () => getHitCount,
46
50
  isErrorLike: () => isErrorLike,
47
51
  isGraderKind: () => isGraderKind,
@@ -51,12 +55,13 @@ __export(index_exports, {
51
55
  isTestMessage: () => isTestMessage,
52
56
  isTestMessageRole: () => isTestMessageRole,
53
57
  listTargetNames: () => listTargetNames,
54
- loadTestCases: () => loadTestCases,
58
+ loadEvalCases: () => loadEvalCases,
55
59
  readTargetDefinitions: () => readTargetDefinitions,
56
60
  resolveAndCreateProvider: () => resolveAndCreateProvider,
61
+ resolveFileReference: () => resolveFileReference,
57
62
  resolveTargetDefinition: () => resolveTargetDefinition,
63
+ runEvalCase: () => runEvalCase,
58
64
  runEvaluation: () => runEvaluation,
59
- runTestCase: () => runTestCase,
60
65
  scoreCandidateResponse: () => scoreCandidateResponse
61
66
  });
62
67
  module.exports = __toCommonJS(index_exports);
@@ -113,6 +118,7 @@ function getHitCount(result) {
113
118
  }
114
119
 
115
120
  // src/evaluation/yaml-parser.ts
121
+ var import_micromatch = __toESM(require("micromatch"), 1);
116
122
  var import_node_fs2 = require("fs");
117
123
  var import_promises2 = require("fs/promises");
118
124
  var import_node_path2 = __toESM(require("path"), 1);
@@ -131,6 +137,46 @@ async function fileExists(filePath) {
131
137
  return false;
132
138
  }
133
139
  }
140
+ async function findGitRoot(startPath) {
141
+ let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
142
+ const root = import_node_path.default.parse(currentDir).root;
143
+ while (currentDir !== root) {
144
+ const gitPath = import_node_path.default.join(currentDir, ".git");
145
+ if (await fileExists(gitPath)) {
146
+ return currentDir;
147
+ }
148
+ const parentDir = import_node_path.default.dirname(currentDir);
149
+ if (parentDir === currentDir) {
150
+ break;
151
+ }
152
+ currentDir = parentDir;
153
+ }
154
+ return null;
155
+ }
156
+ function buildDirectoryChain(filePath, repoRoot) {
157
+ const directories = [];
158
+ const seen = /* @__PURE__ */ new Set();
159
+ const boundary = import_node_path.default.resolve(repoRoot);
160
+ let current = import_node_path.default.resolve(import_node_path.default.dirname(filePath));
161
+ while (current !== void 0) {
162
+ if (!seen.has(current)) {
163
+ directories.push(current);
164
+ seen.add(current);
165
+ }
166
+ if (current === boundary) {
167
+ break;
168
+ }
169
+ const parent = import_node_path.default.dirname(current);
170
+ if (parent === current) {
171
+ break;
172
+ }
173
+ current = parent;
174
+ }
175
+ if (!seen.has(boundary)) {
176
+ directories.push(boundary);
177
+ }
178
+ return directories;
179
+ }
134
180
  function buildSearchRoots(evalPath, repoRoot) {
135
181
  const uniqueRoots = [];
136
182
  const addRoot = (root) => {
@@ -188,9 +234,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
188
234
  var ANSI_YELLOW = "\x1B[33m";
189
235
  var ANSI_RESET = "\x1B[0m";
190
236
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
191
- function isGuidelineFile(filePath) {
237
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
238
+ async function loadConfig(evalFilePath, repoRoot) {
239
+ const directories = buildDirectoryChain(evalFilePath, repoRoot);
240
+ for (const directory of directories) {
241
+ const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
242
+ if (!await fileExists2(configPath)) {
243
+ continue;
244
+ }
245
+ try {
246
+ const rawConfig = await (0, import_promises2.readFile)(configPath, "utf8");
247
+ const parsed = (0, import_yaml.parse)(rawConfig);
248
+ if (!isJsonObject(parsed)) {
249
+ logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
250
+ continue;
251
+ }
252
+ const config = parsed;
253
+ const schema = config.$schema;
254
+ if (schema !== SCHEMA_CONFIG_V2) {
255
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
256
+ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
257
+ logWarning(message);
258
+ continue;
259
+ }
260
+ const guidelinePatterns = config.guideline_patterns;
261
+ if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
262
+ logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
263
+ continue;
264
+ }
265
+ if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
266
+ logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
267
+ continue;
268
+ }
269
+ return {
270
+ guideline_patterns: guidelinePatterns
271
+ };
272
+ } catch (error) {
273
+ logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
274
+ continue;
275
+ }
276
+ }
277
+ return null;
278
+ }
279
+ function isGuidelineFile(filePath, patterns) {
192
280
  const normalized = filePath.split("\\").join("/");
193
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
281
+ const patternsToUse = patterns ?? [];
282
+ return import_micromatch.default.isMatch(normalized, patternsToUse);
194
283
  }
195
284
  function extractCodeBlocks(segments) {
196
285
  const codeBlocks = [];
@@ -210,43 +299,45 @@ function extractCodeBlocks(segments) {
210
299
  }
211
300
  return codeBlocks;
212
301
  }
213
- async function loadTestCases(testFilePath, repoRoot, options) {
302
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
214
303
  const verbose = options?.verbose ?? false;
215
- const absoluteTestPath = import_node_path2.default.resolve(testFilePath);
304
+ const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
216
305
  if (!await fileExists2(absoluteTestPath)) {
217
- throw new Error(`Test file not found: ${testFilePath}`);
306
+ throw new Error(`Test file not found: ${evalFilePath}`);
218
307
  }
219
308
  const repoRootPath = resolveToAbsolutePath(repoRoot);
220
309
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
310
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
311
+ const guidelinePatterns = config?.guideline_patterns;
221
312
  const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
222
313
  const parsed = (0, import_yaml.parse)(rawFile);
223
314
  if (!isJsonObject(parsed)) {
224
- throw new Error(`Invalid test file format: ${testFilePath}`);
315
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
225
316
  }
226
317
  const suite = parsed;
227
318
  const schema = suite.$schema;
228
319
  if (schema !== SCHEMA_EVAL_V2) {
229
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${testFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${testFilePath}.
320
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
230
321
  Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
231
322
  throw new Error(message);
232
323
  }
233
324
  const rawTestcases = suite.evalcases;
234
325
  if (!Array.isArray(rawTestcases)) {
235
- throw new Error(`Invalid test file format: ${testFilePath} - missing 'evalcases' field`);
326
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
236
327
  }
237
328
  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
238
329
  const results = [];
239
- for (const rawTestcase of rawTestcases) {
240
- if (!isJsonObject(rawTestcase)) {
330
+ for (const rawEvalcase of rawTestcases) {
331
+ if (!isJsonObject(rawEvalcase)) {
241
332
  logWarning("Skipping invalid test case entry (expected object)");
242
333
  continue;
243
334
  }
244
- const testcase = rawTestcase;
245
- const id = asString(testcase.id);
246
- const conversationId = asString(testcase.conversation_id);
247
- const outcome = asString(testcase.outcome);
248
- const inputMessagesValue = testcase.input_messages;
249
- const expectedMessagesValue = testcase.expected_messages;
335
+ const evalcase = rawEvalcase;
336
+ const id = asString(evalcase.id);
337
+ const conversationId = asString(evalcase.conversation_id);
338
+ const outcome = asString(evalcase.outcome);
339
+ const inputMessagesValue = evalcase.input_messages;
340
+ const expectedMessagesValue = evalcase.expected_messages;
250
341
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
251
342
  logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
252
343
  continue;
@@ -259,6 +350,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
259
350
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
260
351
  const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
261
352
  const userMessages = inputMessages.filter((message) => message.role === "user");
353
+ const systemMessages = inputMessages.filter((message) => message.role === "system");
262
354
  if (assistantMessages.length === 0) {
263
355
  logWarning(`No assistant message found for test case: ${id}`);
264
356
  continue;
@@ -266,6 +358,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
266
358
  if (assistantMessages.length > 1) {
267
359
  logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
268
360
  }
361
+ if (systemMessages.length > 1) {
362
+ logWarning(`Multiple system messages found for test case: ${id}, using first`);
363
+ }
364
+ let systemMessageContent;
365
+ if (systemMessages.length > 0) {
366
+ const content = systemMessages[0]?.content;
367
+ if (typeof content === "string") {
368
+ systemMessageContent = content;
369
+ } else if (Array.isArray(content)) {
370
+ const textParts = [];
371
+ for (const segment of content) {
372
+ if (isJsonObject(segment)) {
373
+ const value = segment.value;
374
+ if (typeof value === "string") {
375
+ textParts.push(value);
376
+ }
377
+ }
378
+ }
379
+ if (textParts.length > 0) {
380
+ systemMessageContent = textParts.join("\n\n");
381
+ }
382
+ }
383
+ }
269
384
  const userSegments = [];
270
385
  const guidelinePaths = [];
271
386
  const userTextParts = [];
@@ -297,7 +412,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
297
412
  }
298
413
  try {
299
414
  const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
300
- if (isGuidelineFile(displayPath)) {
415
+ const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
416
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
301
417
  guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
302
418
  if (verbose) {
303
419
  console.log(` [Guideline] Found: ${displayPath}`);
@@ -307,7 +423,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
307
423
  userSegments.push({
308
424
  type: "file",
309
425
  path: displayPath,
310
- text: fileContent
426
+ text: fileContent,
427
+ resolvedPath: import_node_path2.default.resolve(resolvedPath)
311
428
  });
312
429
  if (verbose) {
313
430
  console.log(` [File] Found: ${displayPath}`);
@@ -331,14 +448,27 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
331
448
  const assistantContent = assistantMessages[0]?.content;
332
449
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
333
450
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
334
- const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
451
+ const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
452
+ const userFilePaths = [];
453
+ for (const segment of userSegments) {
454
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
455
+ userFilePaths.push(segment.resolvedPath);
456
+ }
457
+ }
458
+ const allFilePaths = [
459
+ ...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
460
+ ...userFilePaths
461
+ ];
335
462
  const testCase = {
336
463
  id,
337
464
  conversation_id: conversationId,
338
465
  task: userTextPrompt,
339
466
  user_segments: userSegments,
467
+ system_message: systemMessageContent,
340
468
  expected_assistant_raw: expectedAssistantRaw,
341
469
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
470
+ guideline_patterns: guidelinePatterns,
471
+ file_paths: allFilePaths,
342
472
  code_snippets: codeSnippets,
343
473
  outcome,
344
474
  grader: testCaseGrader
@@ -404,7 +534,7 @@ ${body}`);
404
534
  }
405
535
  const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
406
536
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
407
- return { request, guidelines };
537
+ return { request, guidelines, systemMessage: testCase.system_message };
408
538
  }
409
539
  async function fileExists2(absolutePath) {
410
540
  try {
@@ -530,15 +660,18 @@ function buildChatPrompt(request) {
530
660
  return request.chatPrompt;
531
661
  }
532
662
  const systemSegments = [];
533
- if (request.guidelines && request.guidelines.trim().length > 0) {
534
- systemSegments.push(`Guidelines:
535
- ${request.guidelines.trim()}`);
536
- }
537
663
  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
538
664
  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
539
665
  systemSegments.push(metadataSystemPrompt.trim());
666
+ } else {
667
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
540
668
  }
541
- const systemContent = systemSegments.length > 0 ? systemSegments.join("\n\n") : DEFAULT_SYSTEM_PROMPT;
669
+ if (request.guidelines && request.guidelines.trim().length > 0) {
670
+ systemSegments.push(`[[ ## Guidelines ## ]]
671
+
672
+ ${request.guidelines.trim()}`);
673
+ }
674
+ const systemContent = systemSegments.join("\n\n");
542
675
  const userContent = request.prompt.trim();
543
676
  const prompt = [
544
677
  {
@@ -995,10 +1128,8 @@ function isLikelyEnvReference(value) {
995
1128
 
996
1129
  // src/evaluation/providers/vscode.ts
997
1130
  var import_promises3 = require("fs/promises");
998
- var import_node_os = require("os");
999
1131
  var import_node_path3 = __toESM(require("path"), 1);
1000
1132
  var import_subagent = require("subagent");
1001
- var PROMPT_FILE_PREFIX = "agentv-vscode-";
1002
1133
  var VSCodeProvider = class {
1003
1134
  id;
1004
1135
  kind;
@@ -1015,128 +1146,89 @@ var VSCodeProvider = class {
1015
1146
  throw new Error("VS Code provider request was aborted before dispatch");
1016
1147
  }
1017
1148
  const attachments = normalizeAttachments(request.attachments);
1018
- const promptContent = buildPromptDocument(request, attachments);
1019
- const directory = await (0, import_promises3.mkdtemp)(import_node_path3.default.join((0, import_node_os.tmpdir)(), PROMPT_FILE_PREFIX));
1020
- const promptPath = import_node_path3.default.join(directory, `${request.testCaseId ?? "request"}.prompt.md`);
1021
- try {
1022
- await (0, import_promises3.writeFile)(promptPath, promptContent, "utf8");
1023
- const session = await (0, import_subagent.dispatchAgentSession)({
1024
- userQuery: composeUserQuery(request),
1025
- promptFile: promptPath,
1026
- extraAttachments: attachments,
1027
- wait: this.config.waitForResponse,
1028
- dryRun: this.config.dryRun,
1029
- vscodeCmd: this.config.command,
1030
- subagentRoot: this.config.subagentRoot,
1031
- workspaceTemplate: this.config.workspaceTemplate,
1032
- silent: true
1033
- });
1034
- if (session.exitCode !== 0 || !session.responseFile) {
1035
- const failure = session.error ?? "VS Code subagent did not produce a response";
1036
- throw new Error(failure);
1037
- }
1038
- if (this.config.dryRun) {
1039
- return {
1040
- text: "",
1041
- raw: {
1042
- session,
1043
- promptFile: promptPath,
1044
- attachments
1045
- }
1046
- };
1047
- }
1048
- const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
1149
+ const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
1150
+ const session = await (0, import_subagent.dispatchAgentSession)({
1151
+ userQuery: promptContent,
1152
+ // Use full prompt content instead of just request.prompt
1153
+ extraAttachments: attachments,
1154
+ wait: this.config.waitForResponse,
1155
+ dryRun: this.config.dryRun,
1156
+ vscodeCmd: this.config.command,
1157
+ subagentRoot: this.config.subagentRoot,
1158
+ workspaceTemplate: this.config.workspaceTemplate,
1159
+ silent: true
1160
+ });
1161
+ if (session.exitCode !== 0 || !session.responseFile) {
1162
+ const failure = session.error ?? "VS Code subagent did not produce a response";
1163
+ throw new Error(failure);
1164
+ }
1165
+ if (this.config.dryRun) {
1049
1166
  return {
1050
- text: responseText,
1167
+ text: "",
1051
1168
  raw: {
1052
1169
  session,
1053
- promptFile: promptPath,
1054
1170
  attachments
1055
1171
  }
1056
1172
  };
1057
- } finally {
1058
- await (0, import_promises3.rm)(directory, { recursive: true, force: true });
1059
1173
  }
1174
+ const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
1175
+ return {
1176
+ text: responseText,
1177
+ raw: {
1178
+ session,
1179
+ attachments
1180
+ }
1181
+ };
1060
1182
  }
1061
1183
  };
1062
- function buildPromptDocument(request, attachments) {
1184
+ function buildPromptDocument(request, attachments, guidelinePatterns) {
1063
1185
  const parts = [];
1064
- const instructionFiles = collectInstructionFiles(attachments);
1065
- if (instructionFiles.length > 0) {
1066
- parts.push(buildMandatoryPrereadBlock(instructionFiles));
1067
- }
1068
- parts.push(`# AgentV Request`);
1069
- if (request.testCaseId) {
1070
- parts.push(`- Test Case: ${request.testCaseId}`);
1071
- }
1072
- if (request.metadata?.target) {
1073
- parts.push(`- Target: ${String(request.metadata.target)}`);
1074
- }
1075
- parts.push("\n## Task\n", request.prompt.trim());
1076
- if (request.guidelines && request.guidelines.trim().length > 0) {
1077
- parts.push("\n## Guidelines\n", request.guidelines.trim());
1078
- }
1079
- if (attachments && attachments.length > 0) {
1080
- const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
1081
- parts.push("\n## Attachments\n", attachmentList);
1186
+ const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1187
+ if (guidelineFiles.length > 0) {
1188
+ parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
1082
1189
  }
1190
+ parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1083
1191
  return parts.join("\n").trim();
1084
1192
  }
1085
- function buildMandatoryPrereadBlock(instructionFiles) {
1086
- if (instructionFiles.length === 0) {
1193
+ function buildMandatoryPrereadBlock(guidelineFiles) {
1194
+ if (guidelineFiles.length === 0) {
1087
1195
  return "";
1088
1196
  }
1089
1197
  const fileList = [];
1090
- const tokenList = [];
1091
1198
  let counter = 0;
1092
- for (const absolutePath of instructionFiles) {
1199
+ for (const absolutePath of guidelineFiles) {
1093
1200
  counter += 1;
1094
1201
  const fileName = import_node_path3.default.basename(absolutePath);
1095
1202
  const fileUri = pathToFileUri(absolutePath);
1096
- fileList.push(`[${fileName}](${fileUri})`);
1097
- tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
1203
+ fileList.push(`* [${fileName}](${fileUri})`);
1098
1204
  }
1099
- const filesText = fileList.join(", ");
1100
- const tokensText = tokenList.join("\n");
1205
+ const filesText = fileList.join("\n");
1101
1206
  const instruction = [
1102
- `Read all instruction files: ${filesText}.`,
1103
- `After reading each file, compute its SHA256 hash using this PowerShell command:`,
1104
- "`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
1105
- `Then include, at the top of your reply, these exact tokens on separate lines:
1207
+ `Read all guideline files:
1208
+ ${filesText}.
1106
1209
  `,
1107
- tokensText,
1108
- `
1109
- Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
1110
1210
  `If any file is missing, fail with ERROR: missing-file <filename> and stop.
1111
1211
  `,
1112
- `Then fetch all documentation required by the instructions before proceeding with your task.`
1113
- ].join(" ");
1114
- return `[[ ## mandatory_pre_read ## ]]
1115
-
1116
- ${instruction}
1117
-
1118
- `;
1212
+ `Then apply system_instructions on the user query below.`
1213
+ ].join("");
1214
+ return `${instruction}`;
1119
1215
  }
1120
- function collectInstructionFiles(attachments) {
1216
+ function collectGuidelineFiles(attachments, guidelinePatterns) {
1121
1217
  if (!attachments || attachments.length === 0) {
1122
1218
  return [];
1123
1219
  }
1124
1220
  const unique = /* @__PURE__ */ new Map();
1125
1221
  for (const attachment of attachments) {
1126
- if (!isInstructionPath(attachment)) {
1127
- continue;
1128
- }
1129
1222
  const absolutePath = import_node_path3.default.resolve(attachment);
1130
- if (!unique.has(absolutePath)) {
1131
- unique.set(absolutePath, absolutePath);
1223
+ const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
1224
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
1225
+ if (!unique.has(absolutePath)) {
1226
+ unique.set(absolutePath, absolutePath);
1227
+ }
1132
1228
  }
1133
1229
  }
1134
1230
  return Array.from(unique.values());
1135
1231
  }
1136
- function isInstructionPath(filePath) {
1137
- const normalized = filePath.split(import_node_path3.default.sep).join("/");
1138
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
1139
- }
1140
1232
  function pathToFileUri(filePath) {
1141
1233
  const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
1142
1234
  const normalizedPath = absolutePath.replace(/\\/g, "/");
@@ -1145,14 +1237,6 @@ function pathToFileUri(filePath) {
1145
1237
  }
1146
1238
  return `file://${normalizedPath}`;
1147
1239
  }
1148
- function composeUserQuery(request) {
1149
- const segments = [];
1150
- segments.push(request.prompt.trim());
1151
- if (request.guidelines && request.guidelines.trim().length > 0) {
1152
- segments.push("\nGuidelines:\n", request.guidelines.trim());
1153
- }
1154
- return segments.join("\n").trim();
1155
- }
1156
1240
  function normalizeAttachments(attachments) {
1157
1241
  if (!attachments || attachments.length === 0) {
1158
1242
  return void 0;
@@ -1504,7 +1588,7 @@ var import_node_crypto = require("crypto");
1504
1588
  var HeuristicGrader = class {
1505
1589
  kind = "heuristic";
1506
1590
  grade(context) {
1507
- const expectedAspects = extractAspects(context.testCase.expected_assistant_raw);
1591
+ const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1508
1592
  const result = scoreCandidateResponse(context.candidate, expectedAspects);
1509
1593
  const misses = [...result.misses];
1510
1594
  if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
@@ -1537,14 +1621,14 @@ var QualityGrader = class {
1537
1621
  if (!judgeProvider) {
1538
1622
  throw new Error("No judge provider available for LLM grading");
1539
1623
  }
1540
- const prompt = buildQualityPrompt(context.testCase, context.candidate);
1624
+ const prompt = buildQualityPrompt(context.evalCase, context.candidate);
1541
1625
  const metadata = {
1542
1626
  systemPrompt: QUALITY_SYSTEM_PROMPT
1543
1627
  };
1544
1628
  const response = await judgeProvider.invoke({
1545
1629
  prompt,
1546
1630
  metadata,
1547
- testCaseId: context.testCase.id,
1631
+ evalCaseId: context.evalCase.id,
1548
1632
  attempt: context.attempt,
1549
1633
  maxOutputTokens: this.maxOutputTokens,
1550
1634
  temperature: this.temperature
@@ -1590,16 +1674,16 @@ var QUALITY_SYSTEM_PROMPT = [
1590
1674
  function buildQualityPrompt(testCase, candidate) {
1591
1675
  const parts = [
1592
1676
  "[[ ## expected_outcome ## ]]",
1593
- testCase.outcome,
1677
+ testCase.outcome.trim(),
1594
1678
  "",
1595
1679
  "[[ ## request ## ]]",
1596
- testCase.task,
1680
+ testCase.task.trim(),
1597
1681
  "",
1598
1682
  "[[ ## reference_answer ## ]]",
1599
- testCase.expected_assistant_raw,
1683
+ testCase.expected_assistant_raw.trim(),
1600
1684
  "",
1601
1685
  "[[ ## generated_answer ## ]]",
1602
- candidate,
1686
+ candidate.trim(),
1603
1687
  "",
1604
1688
  "Respond with a single JSON object matching the schema described in the system prompt."
1605
1689
  ];
@@ -1848,10 +1932,10 @@ async function runEvaluation(options) {
1848
1932
  onResult,
1849
1933
  onProgress
1850
1934
  } = options;
1851
- const load = loadTestCases;
1852
- const testCases = await load(testFilePath, repoRoot, { verbose });
1853
- const filteredTestCases = filterTestCases(testCases, evalId);
1854
- if (filteredTestCases.length === 0) {
1935
+ const load = loadEvalCases;
1936
+ const evalCases = await load(testFilePath, repoRoot, { verbose });
1937
+ const filteredEvalCases = filterEvalCases(evalCases, evalId);
1938
+ if (filteredEvalCases.length === 0) {
1855
1939
  if (evalId) {
1856
1940
  throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
1857
1941
  }
@@ -1897,11 +1981,11 @@ async function runEvaluation(options) {
1897
1981
  };
1898
1982
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
1899
1983
  const primaryProvider = getOrCreateProvider(target);
1900
- if (onProgress && filteredTestCases.length > 0) {
1901
- for (let i = 0; i < filteredTestCases.length; i++) {
1984
+ if (onProgress && filteredEvalCases.length > 0) {
1985
+ for (let i = 0; i < filteredEvalCases.length; i++) {
1902
1986
  await onProgress({
1903
1987
  workerId: i + 1,
1904
- evalId: filteredTestCases[i].id,
1988
+ evalId: filteredEvalCases[i].id,
1905
1989
  status: "pending"
1906
1990
  });
1907
1991
  }
@@ -1910,22 +1994,22 @@ async function runEvaluation(options) {
1910
1994
  const limit = pLimit(workers);
1911
1995
  let nextWorkerId = 1;
1912
1996
  const workerIdByEvalId = /* @__PURE__ */ new Map();
1913
- const promises = filteredTestCases.map(
1914
- (testCase) => limit(async () => {
1997
+ const promises = filteredEvalCases.map(
1998
+ (evalCase) => limit(async () => {
1915
1999
  const workerId = nextWorkerId++;
1916
- workerIdByEvalId.set(testCase.id, workerId);
2000
+ workerIdByEvalId.set(evalCase.id, workerId);
1917
2001
  if (onProgress) {
1918
2002
  await onProgress({
1919
2003
  workerId,
1920
- evalId: testCase.id,
2004
+ evalId: evalCase.id,
1921
2005
  status: "running",
1922
2006
  startedAt: Date.now()
1923
2007
  });
1924
2008
  }
1925
2009
  try {
1926
2010
  const judgeProvider = await resolveJudgeProvider(target);
1927
- const result = await runTestCase({
1928
- testCase,
2011
+ const result = await runEvalCase({
2012
+ evalCase,
1929
2013
  provider: primaryProvider,
1930
2014
  target,
1931
2015
  graders: graderRegistry,
@@ -1940,7 +2024,7 @@ async function runEvaluation(options) {
1940
2024
  if (onProgress) {
1941
2025
  await onProgress({
1942
2026
  workerId,
1943
- evalId: testCase.id,
2027
+ evalId: evalCase.id,
1944
2028
  status: "completed",
1945
2029
  startedAt: 0,
1946
2030
  // Not used for completed status
@@ -1955,7 +2039,7 @@ async function runEvaluation(options) {
1955
2039
  if (onProgress) {
1956
2040
  await onProgress({
1957
2041
  workerId,
1958
- evalId: testCase.id,
2042
+ evalId: evalCase.id,
1959
2043
  status: "failed",
1960
2044
  completedAt: Date.now(),
1961
2045
  error: error instanceof Error ? error.message : String(error)
@@ -1972,10 +2056,10 @@ async function runEvaluation(options) {
1972
2056
  if (outcome.status === "fulfilled") {
1973
2057
  results.push(outcome.value);
1974
2058
  } else {
1975
- const testCase = filteredTestCases[i];
1976
- const promptInputs = await buildPromptInputs(testCase);
2059
+ const evalCase = filteredEvalCases[i];
2060
+ const promptInputs = await buildPromptInputs(evalCase);
1977
2061
  const errorResult = buildErrorResult(
1978
- testCase,
2062
+ evalCase,
1979
2063
  target.name,
1980
2064
  (now ?? (() => /* @__PURE__ */ new Date()))(),
1981
2065
  outcome.reason,
@@ -1989,9 +2073,9 @@ async function runEvaluation(options) {
1989
2073
  }
1990
2074
  return results;
1991
2075
  }
1992
- async function runTestCase(options) {
2076
+ async function runEvalCase(options) {
1993
2077
  const {
1994
- testCase,
2078
+ evalCase,
1995
2079
  provider,
1996
2080
  target,
1997
2081
  graders,
@@ -2004,11 +2088,11 @@ async function runTestCase(options) {
2004
2088
  signal,
2005
2089
  judgeProvider
2006
2090
  } = options;
2007
- const promptInputs = await buildPromptInputs(testCase);
2091
+ const promptInputs = await buildPromptInputs(evalCase);
2008
2092
  if (promptDumpDir) {
2009
- await dumpPrompt(promptDumpDir, testCase, promptInputs);
2093
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
2010
2094
  }
2011
- const cacheKey = useCache ? createCacheKey(provider, target, testCase, promptInputs) : void 0;
2095
+ const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
2012
2096
  let cachedResponse;
2013
2097
  if (cacheKey && cache) {
2014
2098
  cachedResponse = await cache.get(cacheKey);
@@ -2021,7 +2105,7 @@ async function runTestCase(options) {
2021
2105
  while (!providerResponse && attempt < attemptBudget) {
2022
2106
  try {
2023
2107
  providerResponse = await invokeProvider(provider, {
2024
- testCase,
2108
+ evalCase,
2025
2109
  target,
2026
2110
  promptInputs,
2027
2111
  attempt,
@@ -2034,12 +2118,12 @@ async function runTestCase(options) {
2034
2118
  attempt += 1;
2035
2119
  continue;
2036
2120
  }
2037
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
2121
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2038
2122
  }
2039
2123
  }
2040
2124
  if (!providerResponse) {
2041
2125
  return buildErrorResult(
2042
- testCase,
2126
+ evalCase,
2043
2127
  target.name,
2044
2128
  nowFn(),
2045
2129
  lastError ?? new Error("Provider did not return a response"),
@@ -2049,7 +2133,7 @@ async function runTestCase(options) {
2049
2133
  if (cacheKey && cache && !cachedResponse) {
2050
2134
  await cache.set(cacheKey, providerResponse);
2051
2135
  }
2052
- const graderKind = testCase.grader ?? "heuristic";
2136
+ const graderKind = evalCase.grader ?? "heuristic";
2053
2137
  const activeGrader = graders[graderKind] ?? graders.heuristic;
2054
2138
  if (!activeGrader) {
2055
2139
  throw new Error(`No grader registered for kind '${graderKind}'`);
@@ -2058,7 +2142,7 @@ async function runTestCase(options) {
2058
2142
  try {
2059
2143
  const gradeTimestamp = nowFn();
2060
2144
  grade = await activeGrader.grade({
2061
- testCase,
2145
+ evalCase,
2062
2146
  candidate: providerResponse.text ?? "",
2063
2147
  target,
2064
2148
  provider,
@@ -2068,17 +2152,18 @@ async function runTestCase(options) {
2068
2152
  judgeProvider
2069
2153
  });
2070
2154
  } catch (error) {
2071
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
2155
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2072
2156
  }
2073
2157
  const completedAt = nowFn();
2074
2158
  const rawRequest = {
2075
2159
  request: promptInputs.request,
2076
2160
  guidelines: promptInputs.guidelines,
2077
- guideline_paths: testCase.guideline_paths
2161
+ guideline_paths: evalCase.guideline_paths,
2162
+ system_message: promptInputs.systemMessage ?? ""
2078
2163
  };
2079
2164
  return {
2080
- eval_id: testCase.id,
2081
- conversation_id: testCase.conversation_id,
2165
+ eval_id: evalCase.id,
2166
+ conversation_id: evalCase.conversation_id,
2082
2167
  score: grade.score,
2083
2168
  hits: grade.hits,
2084
2169
  misses: grade.misses,
@@ -2092,11 +2177,11 @@ async function runTestCase(options) {
2092
2177
  grader_raw_request: grade.graderRawRequest
2093
2178
  };
2094
2179
  }
2095
- function filterTestCases(testCases, evalId) {
2180
+ function filterEvalCases(evalCases, evalId) {
2096
2181
  if (!evalId) {
2097
- return testCases;
2182
+ return evalCases;
2098
2183
  }
2099
- return testCases.filter((testCase) => testCase.id === evalId);
2184
+ return evalCases.filter((evalCase) => evalCase.id === evalId);
2100
2185
  }
2101
2186
  function buildGraderRegistry(overrides, resolveJudgeProvider) {
2102
2187
  const heuristic = overrides?.heuristic ?? new HeuristicGrader();
@@ -2114,16 +2199,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
2114
2199
  llm_judge: llmJudge
2115
2200
  };
2116
2201
  }
2117
- async function dumpPrompt(directory, testCase, promptInputs) {
2202
+ async function dumpPrompt(directory, evalCase, promptInputs) {
2118
2203
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2119
- const filename = `${timestamp}_${sanitizeFilename(testCase.id)}.json`;
2204
+ const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
2120
2205
  const filePath = import_node_path5.default.resolve(directory, filename);
2121
2206
  await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
2122
2207
  const payload = {
2123
- eval_id: testCase.id,
2208
+ eval_id: evalCase.id,
2124
2209
  request: promptInputs.request,
2125
2210
  guidelines: promptInputs.guidelines,
2126
- guideline_paths: testCase.guideline_paths
2211
+ guideline_paths: evalCase.guideline_paths
2127
2212
  };
2128
2213
  await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
2129
2214
  }
@@ -2135,7 +2220,7 @@ function sanitizeFilename(value) {
2135
2220
  return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
2136
2221
  }
2137
2222
  async function invokeProvider(provider, options) {
2138
- const { testCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2223
+ const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2139
2224
  const controller = new AbortController();
2140
2225
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2141
2226
  if (signal) {
@@ -2145,12 +2230,12 @@ async function invokeProvider(provider, options) {
2145
2230
  return await provider.invoke({
2146
2231
  prompt: promptInputs.request,
2147
2232
  guidelines: promptInputs.guidelines,
2148
- attachments: testCase.guideline_paths,
2149
- testCaseId: testCase.id,
2233
+ guideline_patterns: evalCase.guideline_patterns,
2234
+ attachments: evalCase.file_paths,
2235
+ evalCaseId: evalCase.id,
2150
2236
  attempt,
2151
2237
  metadata: {
2152
- target: target.name,
2153
- grader: testCase.grader
2238
+ systemPrompt: promptInputs.systemMessage ?? ""
2154
2239
  },
2155
2240
  signal: controller.signal
2156
2241
  });
@@ -2160,17 +2245,18 @@ async function invokeProvider(provider, options) {
2160
2245
  }
2161
2246
  }
2162
2247
  }
2163
- function buildErrorResult(testCase, targetName, timestamp, error, promptInputs) {
2248
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
2164
2249
  const message = error instanceof Error ? error.message : String(error);
2165
2250
  const rawRequest = {
2166
2251
  request: promptInputs.request,
2167
2252
  guidelines: promptInputs.guidelines,
2168
- guideline_paths: testCase.guideline_paths,
2253
+ guideline_paths: evalCase.guideline_paths,
2254
+ system_message: promptInputs.systemMessage ?? "",
2169
2255
  error: message
2170
2256
  };
2171
2257
  return {
2172
- eval_id: testCase.id,
2173
- conversation_id: testCase.conversation_id,
2258
+ eval_id: evalCase.id,
2259
+ conversation_id: evalCase.conversation_id,
2174
2260
  score: 0,
2175
2261
  hits: [],
2176
2262
  misses: [`Error: ${message}`],
@@ -2182,13 +2268,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
2182
2268
  raw_request: rawRequest
2183
2269
  };
2184
2270
  }
2185
- function createCacheKey(provider, target, testCase, promptInputs) {
2271
+ function createCacheKey(provider, target, evalCase, promptInputs) {
2186
2272
  const hash = (0, import_node_crypto2.createHash)("sha256");
2187
2273
  hash.update(provider.id);
2188
2274
  hash.update(target.name);
2189
- hash.update(testCase.id);
2275
+ hash.update(evalCase.id);
2190
2276
  hash.update(promptInputs.request);
2191
2277
  hash.update(promptInputs.guidelines);
2278
+ hash.update(promptInputs.systemMessage ?? "");
2192
2279
  return hash.digest("hex");
2193
2280
  }
2194
2281
  function isTimeoutLike(error) {
@@ -2217,7 +2304,9 @@ function createAgentKernel() {
2217
2304
  HeuristicGrader,
2218
2305
  QualityGrader,
2219
2306
  TEST_MESSAGE_ROLES,
2307
+ buildDirectoryChain,
2220
2308
  buildPromptInputs,
2309
+ buildSearchRoots,
2221
2310
  calculateHits,
2222
2311
  calculateMisses,
2223
2312
  createAgentKernel,
@@ -2225,6 +2314,8 @@ function createAgentKernel() {
2225
2314
  ensureVSCodeSubagents,
2226
2315
  extractAspects,
2227
2316
  extractCodeBlocks,
2317
+ fileExists,
2318
+ findGitRoot,
2228
2319
  getHitCount,
2229
2320
  isErrorLike,
2230
2321
  isGraderKind,
@@ -2234,12 +2325,13 @@ function createAgentKernel() {
2234
2325
  isTestMessage,
2235
2326
  isTestMessageRole,
2236
2327
  listTargetNames,
2237
- loadTestCases,
2328
+ loadEvalCases,
2238
2329
  readTargetDefinitions,
2239
2330
  resolveAndCreateProvider,
2331
+ resolveFileReference,
2240
2332
  resolveTargetDefinition,
2333
+ runEvalCase,
2241
2334
  runEvaluation,
2242
- runTestCase,
2243
2335
  scoreCandidateResponse
2244
2336
  });
2245
2337
  //# sourceMappingURL=index.cjs.map