@agentv/core 0.2.6 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -34,7 +34,9 @@ __export(index_exports, {
34
34
  HeuristicGrader: () => HeuristicGrader,
35
35
  QualityGrader: () => QualityGrader,
36
36
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
37
+ buildDirectoryChain: () => buildDirectoryChain,
37
38
  buildPromptInputs: () => buildPromptInputs,
39
+ buildSearchRoots: () => buildSearchRoots,
38
40
  calculateHits: () => calculateHits,
39
41
  calculateMisses: () => calculateMisses,
40
42
  createAgentKernel: () => createAgentKernel,
@@ -42,6 +44,8 @@ __export(index_exports, {
42
44
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
43
45
  extractAspects: () => extractAspects,
44
46
  extractCodeBlocks: () => extractCodeBlocks,
47
+ fileExists: () => fileExists,
48
+ findGitRoot: () => findGitRoot,
45
49
  getHitCount: () => getHitCount,
46
50
  isErrorLike: () => isErrorLike,
47
51
  isGraderKind: () => isGraderKind,
@@ -51,12 +55,13 @@ __export(index_exports, {
51
55
  isTestMessage: () => isTestMessage,
52
56
  isTestMessageRole: () => isTestMessageRole,
53
57
  listTargetNames: () => listTargetNames,
54
- loadTestCases: () => loadTestCases,
58
+ loadEvalCases: () => loadEvalCases,
55
59
  readTargetDefinitions: () => readTargetDefinitions,
56
60
  resolveAndCreateProvider: () => resolveAndCreateProvider,
61
+ resolveFileReference: () => resolveFileReference,
57
62
  resolveTargetDefinition: () => resolveTargetDefinition,
63
+ runEvalCase: () => runEvalCase,
58
64
  runEvaluation: () => runEvaluation,
59
- runTestCase: () => runTestCase,
60
65
  scoreCandidateResponse: () => scoreCandidateResponse
61
66
  });
62
67
  module.exports = __toCommonJS(index_exports);
@@ -113,6 +118,7 @@ function getHitCount(result) {
113
118
  }
114
119
 
115
120
  // src/evaluation/yaml-parser.ts
121
+ var import_micromatch = __toESM(require("micromatch"), 1);
116
122
  var import_node_fs2 = require("fs");
117
123
  var import_promises2 = require("fs/promises");
118
124
  var import_node_path2 = __toESM(require("path"), 1);
@@ -131,6 +137,46 @@ async function fileExists(filePath) {
131
137
  return false;
132
138
  }
133
139
  }
140
/**
 * Walk upward from the directory that contains `startPath` and return the
 * first directory for which `fileExists(<dir>/.git)` reports true.
 *
 * The filesystem root itself is probed as well, so a repository checked out
 * directly at the root (e.g. inside a container) is still detected.
 *
 * @param {string} startPath - Path to start from; it is resolved to an
 *   absolute path and its parent directory is the first one probed.
 * @returns {Promise<string | null>} The matching directory, or `null` when no
 *   ancestor (root included) has a `.git` entry.
 */
async function findGitRoot(startPath) {
  let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
  for (;;) {
    const gitPath = import_node_path.default.join(currentDir, ".git");
    if (await fileExists(gitPath)) {
      return currentDir;
    }
    const parentDir = import_node_path.default.dirname(currentDir);
    // dirname() of the filesystem root returns the root again: nothing left to probe.
    if (parentDir === currentDir) {
      return null;
    }
    currentDir = parentDir;
  }
}
156
/**
 * List the directories from the one containing `filePath` up to `repoRoot`,
 * nearest first.
 *
 * When `repoRoot` is not an ancestor of `filePath`, the walk continues to the
 * filesystem root and the resolved `repoRoot` is appended as the last entry.
 *
 * @param {string} filePath - File whose parent directory starts the chain.
 * @param {string} repoRoot - Directory at which the upward walk stops.
 * @returns {string[]} Absolute directory paths, without duplicates.
 */
function buildDirectoryChain(filePath, repoRoot) {
  const boundary = import_node_path.default.resolve(repoRoot);
  const directories = [];
  let current = import_node_path.default.resolve(import_node_path.default.dirname(filePath));
  // Each step moves strictly upward, so no entry can repeat and no
  // de-duplication bookkeeping is needed.
  for (;;) {
    directories.push(current);
    if (current === boundary) {
      return directories;
    }
    const parent = import_node_path.default.dirname(current);
    if (parent === current) {
      // Hit the filesystem root without meeting the boundary.
      break;
    }
    current = parent;
  }
  directories.push(boundary);
  return directories;
}
134
180
  function buildSearchRoots(evalPath, repoRoot) {
135
181
  const uniqueRoots = [];
136
182
  const addRoot = (root) => {
@@ -188,9 +234,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
188
234
  var ANSI_YELLOW = "\x1B[33m";
189
235
  var ANSI_RESET = "\x1B[0m";
190
236
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
191
- function isGuidelineFile(filePath) {
237
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
238
/**
 * Find the first usable `.agentv/config.yaml` for an eval file.
 *
 * Candidate directories come from `buildDirectoryChain(evalFilePath, repoRoot)`
 * and are tried in that order. A candidate that is missing, unreadable, not a
 * JSON-like object, carries the wrong `$schema`, or has an invalid
 * `guideline_patterns` value is skipped (with a warning, except when the file
 * simply does not exist) and the search moves on to the next directory.
 *
 * @param {string} evalFilePath - Path of the eval file the config applies to.
 * @param {string} repoRoot - Directory that bounds the upward search.
 * @returns {Promise<{ guideline_patterns: string[] | undefined } | null>}
 *   The first valid config, or `null` when none was found.
 */
async function loadConfig(evalFilePath, repoRoot) {
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
  for (const directory of directories) {
    const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
    if (!await fileExists2(configPath)) {
      continue;
    }
    try {
      const rawConfig = await (0, import_promises2.readFile)(configPath, "utf8");
      const parsed = (0, import_yaml.parse)(rawConfig);
      if (!isJsonObject(parsed)) {
        logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
        continue;
      }
      const schema = parsed.$schema;
      if (schema !== SCHEMA_CONFIG_V2) {
        const message = typeof schema === "string"
          ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'`
          : `Missing required field '$schema' in ${configPath}.\nPlease add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
        logWarning(message);
        continue;
      }
      const guidelinePatterns = parsed.guideline_patterns;
      // The field is optional; validate it only when it is present.
      if (guidelinePatterns !== void 0) {
        if (!Array.isArray(guidelinePatterns)) {
          logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
          continue;
        }
        if (!guidelinePatterns.every((p) => typeof p === "string")) {
          logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
          continue;
        }
      }
      return {
        guideline_patterns: guidelinePatterns
      };
    } catch (error) {
      // Anything can be thrown in JavaScript; avoid logging "undefined" when
      // the thrown value is not an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${reason}`);
    }
  }
  return null;
}
279
+ function isGuidelineFile(filePath, patterns) {
192
280
  const normalized = filePath.split("\\").join("/");
193
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
281
+ const patternsToUse = patterns ?? [];
282
+ return import_micromatch.default.isMatch(normalized, patternsToUse);
194
283
  }
195
284
  function extractCodeBlocks(segments) {
196
285
  const codeBlocks = [];
@@ -210,43 +299,45 @@ function extractCodeBlocks(segments) {
210
299
  }
211
300
  return codeBlocks;
212
301
  }
213
- async function loadTestCases(testFilePath, repoRoot, options) {
302
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
214
303
  const verbose = options?.verbose ?? false;
215
- const absoluteTestPath = import_node_path2.default.resolve(testFilePath);
304
+ const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
216
305
  if (!await fileExists2(absoluteTestPath)) {
217
- throw new Error(`Test file not found: ${testFilePath}`);
306
+ throw new Error(`Test file not found: ${evalFilePath}`);
218
307
  }
219
308
  const repoRootPath = resolveToAbsolutePath(repoRoot);
220
309
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
310
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
311
+ const guidelinePatterns = config?.guideline_patterns;
221
312
  const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
222
313
  const parsed = (0, import_yaml.parse)(rawFile);
223
314
  if (!isJsonObject(parsed)) {
224
- throw new Error(`Invalid test file format: ${testFilePath}`);
315
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
225
316
  }
226
317
  const suite = parsed;
227
318
  const schema = suite.$schema;
228
319
  if (schema !== SCHEMA_EVAL_V2) {
229
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${testFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${testFilePath}.
320
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
230
321
  Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
231
322
  throw new Error(message);
232
323
  }
233
324
  const rawTestcases = suite.evalcases;
234
325
  if (!Array.isArray(rawTestcases)) {
235
- throw new Error(`Invalid test file format: ${testFilePath} - missing 'evalcases' field`);
326
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
236
327
  }
237
328
  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
238
329
  const results = [];
239
- for (const rawTestcase of rawTestcases) {
240
- if (!isJsonObject(rawTestcase)) {
330
+ for (const rawEvalcase of rawTestcases) {
331
+ if (!isJsonObject(rawEvalcase)) {
241
332
  logWarning("Skipping invalid test case entry (expected object)");
242
333
  continue;
243
334
  }
244
- const testcase = rawTestcase;
245
- const id = asString(testcase.id);
246
- const conversationId = asString(testcase.conversation_id);
247
- const outcome = asString(testcase.outcome);
248
- const inputMessagesValue = testcase.input_messages;
249
- const expectedMessagesValue = testcase.expected_messages;
335
+ const evalcase = rawEvalcase;
336
+ const id = asString(evalcase.id);
337
+ const conversationId = asString(evalcase.conversation_id);
338
+ const outcome = asString(evalcase.outcome);
339
+ const inputMessagesValue = evalcase.input_messages;
340
+ const expectedMessagesValue = evalcase.expected_messages;
250
341
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
251
342
  logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
252
343
  continue;
@@ -259,6 +350,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
259
350
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
260
351
  const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
261
352
  const userMessages = inputMessages.filter((message) => message.role === "user");
353
+ const systemMessages = inputMessages.filter((message) => message.role === "system");
262
354
  if (assistantMessages.length === 0) {
263
355
  logWarning(`No assistant message found for test case: ${id}`);
264
356
  continue;
@@ -266,6 +358,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
266
358
  if (assistantMessages.length > 1) {
267
359
  logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
268
360
  }
361
+ if (systemMessages.length > 1) {
362
+ logWarning(`Multiple system messages found for test case: ${id}, using first`);
363
+ }
364
+ let systemMessageContent;
365
+ if (systemMessages.length > 0) {
366
+ const content = systemMessages[0]?.content;
367
+ if (typeof content === "string") {
368
+ systemMessageContent = content;
369
+ } else if (Array.isArray(content)) {
370
+ const textParts = [];
371
+ for (const segment of content) {
372
+ if (isJsonObject(segment)) {
373
+ const value = segment.value;
374
+ if (typeof value === "string") {
375
+ textParts.push(value);
376
+ }
377
+ }
378
+ }
379
+ if (textParts.length > 0) {
380
+ systemMessageContent = textParts.join("\n\n");
381
+ }
382
+ }
383
+ }
269
384
  const userSegments = [];
270
385
  const guidelinePaths = [];
271
386
  const userTextParts = [];
@@ -297,7 +412,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
297
412
  }
298
413
  try {
299
414
  const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
300
- if (isGuidelineFile(displayPath)) {
415
+ const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
416
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
301
417
  guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
302
418
  if (verbose) {
303
419
  console.log(` [Guideline] Found: ${displayPath}`);
@@ -307,7 +423,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
307
423
  userSegments.push({
308
424
  type: "file",
309
425
  path: displayPath,
310
- text: fileContent
426
+ text: fileContent,
427
+ resolvedPath: import_node_path2.default.resolve(resolvedPath)
311
428
  });
312
429
  if (verbose) {
313
430
  console.log(` [File] Found: ${displayPath}`);
@@ -331,14 +448,27 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
331
448
  const assistantContent = assistantMessages[0]?.content;
332
449
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
333
450
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
334
- const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
451
+ const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
452
+ const userFilePaths = [];
453
+ for (const segment of userSegments) {
454
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
455
+ userFilePaths.push(segment.resolvedPath);
456
+ }
457
+ }
458
+ const allFilePaths = [
459
+ ...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
460
+ ...userFilePaths
461
+ ];
335
462
  const testCase = {
336
463
  id,
337
464
  conversation_id: conversationId,
338
465
  task: userTextPrompt,
339
466
  user_segments: userSegments,
467
+ system_message: systemMessageContent,
340
468
  expected_assistant_raw: expectedAssistantRaw,
341
469
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
470
+ guideline_patterns: guidelinePatterns,
471
+ file_paths: allFilePaths,
342
472
  code_snippets: codeSnippets,
343
473
  outcome,
344
474
  grader: testCaseGrader
@@ -404,7 +534,7 @@ ${body}`);
404
534
  }
405
535
  const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
406
536
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
407
- return { request, guidelines };
537
+ return { request, guidelines, systemMessage: testCase.system_message };
408
538
  }
409
539
  async function fileExists2(absolutePath) {
410
540
  try {
@@ -530,15 +660,18 @@ function buildChatPrompt(request) {
530
660
  return request.chatPrompt;
531
661
  }
532
662
  const systemSegments = [];
533
- if (request.guidelines && request.guidelines.trim().length > 0) {
534
- systemSegments.push(`Guidelines:
535
- ${request.guidelines.trim()}`);
536
- }
537
663
  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
538
664
  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
539
665
  systemSegments.push(metadataSystemPrompt.trim());
666
+ } else {
667
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
540
668
  }
541
- const systemContent = systemSegments.length > 0 ? systemSegments.join("\n\n") : DEFAULT_SYSTEM_PROMPT;
669
+ if (request.guidelines && request.guidelines.trim().length > 0) {
670
+ systemSegments.push(`[[ ## Guidelines ## ]]
671
+
672
+ ${request.guidelines.trim()}`);
673
+ }
674
+ const systemContent = systemSegments.join("\n\n");
542
675
  const userContent = request.prompt.trim();
543
676
  const prompt = [
544
677
  {
@@ -767,6 +900,9 @@ function normalizeAzureApiVersion(value) {
767
900
  function resolveTargetDefinition(definition, env = process.env) {
768
901
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
769
902
  const provider = parsed.provider.toLowerCase();
903
+ const providerBatching = resolveOptionalBoolean(
904
+ parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
905
+ );
770
906
  switch (provider) {
771
907
  case "azure":
772
908
  case "azure-openai":
@@ -775,6 +911,7 @@ function resolveTargetDefinition(definition, env = process.env) {
775
911
  name: parsed.name,
776
912
  judgeTarget: parsed.judge_target,
777
913
  workers: parsed.workers,
914
+ providerBatching,
778
915
  config: resolveAzureConfig(parsed, env)
779
916
  };
780
917
  case "anthropic":
@@ -783,6 +920,7 @@ function resolveTargetDefinition(definition, env = process.env) {
783
920
  name: parsed.name,
784
921
  judgeTarget: parsed.judge_target,
785
922
  workers: parsed.workers,
923
+ providerBatching,
786
924
  config: resolveAnthropicConfig(parsed, env)
787
925
  };
788
926
  case "gemini":
@@ -793,6 +931,7 @@ function resolveTargetDefinition(definition, env = process.env) {
793
931
  name: parsed.name,
794
932
  judgeTarget: parsed.judge_target,
795
933
  workers: parsed.workers,
934
+ providerBatching,
796
935
  config: resolveGeminiConfig(parsed, env)
797
936
  };
798
937
  case "mock":
@@ -801,6 +940,7 @@ function resolveTargetDefinition(definition, env = process.env) {
801
940
  name: parsed.name,
802
941
  judgeTarget: parsed.judge_target,
803
942
  workers: parsed.workers,
943
+ providerBatching,
804
944
  config: resolveMockConfig(parsed)
805
945
  };
806
946
  case "vscode":
@@ -810,6 +950,7 @@ function resolveTargetDefinition(definition, env = process.env) {
810
950
  name: parsed.name,
811
951
  judgeTarget: parsed.judge_target,
812
952
  workers: parsed.workers,
953
+ providerBatching,
813
954
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
814
955
  };
815
956
  default:
@@ -995,14 +1136,13 @@ function isLikelyEnvReference(value) {
995
1136
 
996
1137
  // src/evaluation/providers/vscode.ts
997
1138
  var import_promises3 = require("fs/promises");
998
- var import_node_os = require("os");
999
1139
  var import_node_path3 = __toESM(require("path"), 1);
1000
1140
  var import_subagent = require("subagent");
1001
- var PROMPT_FILE_PREFIX = "agentv-vscode-";
1002
1141
  var VSCodeProvider = class {
1003
1142
  id;
1004
1143
  kind;
1005
1144
  targetName;
1145
+ supportsBatch = true;
1006
1146
  config;
1007
1147
  constructor(targetName, config, kind) {
1008
1148
  this.id = `${kind}:${targetName}`;
@@ -1015,117 +1155,159 @@ var VSCodeProvider = class {
1015
1155
  throw new Error("VS Code provider request was aborted before dispatch");
1016
1156
  }
1017
1157
  const attachments = normalizeAttachments(request.attachments);
1018
- const promptContent = buildPromptDocument(request, attachments);
1019
- const directory = await (0, import_promises3.mkdtemp)(import_node_path3.default.join((0, import_node_os.tmpdir)(), PROMPT_FILE_PREFIX));
1020
- const promptPath = import_node_path3.default.join(directory, `${request.testCaseId ?? "request"}.prompt.md`);
1021
- try {
1022
- await (0, import_promises3.writeFile)(promptPath, promptContent, "utf8");
1023
- const session = await (0, import_subagent.dispatchAgentSession)({
1024
- userQuery: composeUserQuery(request),
1025
- promptFile: promptPath,
1026
- extraAttachments: attachments,
1027
- wait: this.config.waitForResponse,
1028
- dryRun: this.config.dryRun,
1029
- vscodeCmd: this.config.command,
1030
- subagentRoot: this.config.subagentRoot,
1031
- workspaceTemplate: this.config.workspaceTemplate,
1032
- silent: true
1033
- });
1034
- if (session.exitCode !== 0 || !session.responseFile) {
1035
- const failure = session.error ?? "VS Code subagent did not produce a response";
1036
- throw new Error(failure);
1037
- }
1038
- if (this.config.dryRun) {
1039
- return {
1040
- text: "",
1041
- raw: {
1042
- session,
1043
- promptFile: promptPath,
1044
- attachments
1045
- }
1046
- };
1047
- }
1048
- const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
1158
+ const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
1159
+ const session = await (0, import_subagent.dispatchAgentSession)({
1160
+ userQuery: promptContent,
1161
+ // Use full prompt content instead of just request.prompt
1162
+ extraAttachments: attachments,
1163
+ wait: this.config.waitForResponse,
1164
+ dryRun: this.config.dryRun,
1165
+ vscodeCmd: this.config.command,
1166
+ subagentRoot: this.config.subagentRoot,
1167
+ workspaceTemplate: this.config.workspaceTemplate,
1168
+ silent: true
1169
+ });
1170
+ if (session.exitCode !== 0 || !session.responseFile) {
1171
+ const failure = session.error ?? "VS Code subagent did not produce a response";
1172
+ throw new Error(failure);
1173
+ }
1174
+ if (this.config.dryRun) {
1049
1175
  return {
1050
- text: responseText,
1176
+ text: "",
1051
1177
  raw: {
1052
1178
  session,
1053
- promptFile: promptPath,
1054
1179
  attachments
1055
1180
  }
1056
1181
  };
1057
- } finally {
1058
- await (0, import_promises3.rm)(directory, { recursive: true, force: true });
1059
1182
  }
1183
+ const responseText = await (0, import_promises3.readFile)(session.responseFile, "utf8");
1184
+ return {
1185
+ text: responseText,
1186
+ raw: {
1187
+ session,
1188
+ attachments
1189
+ }
1190
+ };
1191
+ }
1192
+ async invokeBatch(requests) {
1193
+ if (requests.length === 0) {
1194
+ return [];
1195
+ }
1196
+ const normalizedRequests = requests.map((req) => ({
1197
+ request: req,
1198
+ attachments: normalizeAttachments(req.attachments)
1199
+ }));
1200
+ const combinedAttachments = mergeAttachments(
1201
+ normalizedRequests.map(({ attachments }) => attachments)
1202
+ );
1203
+ const userQueries = normalizedRequests.map(
1204
+ ({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
1205
+ );
1206
+ const session = await (0, import_subagent.dispatchBatchAgent)({
1207
+ userQueries,
1208
+ extraAttachments: combinedAttachments,
1209
+ wait: this.config.waitForResponse,
1210
+ dryRun: this.config.dryRun,
1211
+ vscodeCmd: this.config.command,
1212
+ subagentRoot: this.config.subagentRoot,
1213
+ workspaceTemplate: this.config.workspaceTemplate,
1214
+ silent: true
1215
+ });
1216
+ if (session.exitCode !== 0 || !session.responseFiles) {
1217
+ const failure = session.error ?? "VS Code subagent did not produce batch responses";
1218
+ throw new Error(failure);
1219
+ }
1220
+ if (this.config.dryRun) {
1221
+ return normalizedRequests.map(({ attachments }) => ({
1222
+ text: "",
1223
+ raw: {
1224
+ session,
1225
+ attachments,
1226
+ allAttachments: combinedAttachments
1227
+ }
1228
+ }));
1229
+ }
1230
+ if (session.responseFiles.length !== requests.length) {
1231
+ throw new Error(
1232
+ `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
1233
+ );
1234
+ }
1235
+ const responses = [];
1236
+ for (const [index, responseFile] of session.responseFiles.entries()) {
1237
+ const responseText = await (0, import_promises3.readFile)(responseFile, "utf8");
1238
+ responses.push({
1239
+ text: responseText,
1240
+ raw: {
1241
+ session,
1242
+ attachments: normalizedRequests[index]?.attachments,
1243
+ allAttachments: combinedAttachments,
1244
+ responseFile
1245
+ }
1246
+ });
1247
+ }
1248
+ return responses;
1060
1249
  }
1061
1250
  };
1062
- function buildPromptDocument(request, attachments) {
1251
+ function buildPromptDocument(request, attachments, guidelinePatterns) {
1063
1252
  const parts = [];
1064
- const instructionFiles = collectInstructionFiles(attachments);
1065
- if (instructionFiles.length > 0) {
1066
- parts.push(buildMandatoryPrereadBlock(instructionFiles));
1067
- }
1068
- parts.push(`# AgentV Request`);
1069
- if (request.testCaseId) {
1070
- parts.push(`- Test Case: ${request.testCaseId}`);
1071
- }
1072
- if (request.metadata?.target) {
1073
- parts.push(`- Target: ${String(request.metadata.target)}`);
1074
- }
1075
- parts.push("\n## Task\n", request.prompt.trim());
1076
- if (request.guidelines && request.guidelines.trim().length > 0) {
1077
- parts.push("\n## Guidelines\n", request.guidelines.trim());
1078
- }
1079
- if (attachments && attachments.length > 0) {
1080
- const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
1081
- parts.push("\n## Attachments\n", attachmentList);
1253
+ const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1254
+ const attachmentFiles = collectAttachmentFiles(attachments);
1255
+ const nonGuidelineAttachments = attachmentFiles.filter(
1256
+ (file) => !guidelineFiles.includes(file)
1257
+ );
1258
+ const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
1259
+ if (prereadBlock.length > 0) {
1260
+ parts.push("\n", prereadBlock);
1082
1261
  }
1262
+ parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
1083
1263
  return parts.join("\n").trim();
1084
1264
  }
1085
- function buildMandatoryPrereadBlock(instructionFiles) {
1086
- if (instructionFiles.length === 0) {
1265
+ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
1266
+ if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
1087
1267
  return "";
1088
1268
  }
1089
- const fileList = [];
1090
- const tokenList = [];
1091
- let counter = 0;
1092
- for (const absolutePath of instructionFiles) {
1093
- counter += 1;
1269
+ const buildList = (files) => files.map((absolutePath) => {
1094
1270
  const fileName = import_node_path3.default.basename(absolutePath);
1095
1271
  const fileUri = pathToFileUri(absolutePath);
1096
- fileList.push(`[${fileName}](${fileUri})`);
1097
- tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
1098
- }
1099
- const filesText = fileList.join(", ");
1100
- const tokensText = tokenList.join("\n");
1101
- const instruction = [
1102
- `Read all instruction files: ${filesText}.`,
1103
- `After reading each file, compute its SHA256 hash using this PowerShell command:`,
1104
- "`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
1105
- `Then include, at the top of your reply, these exact tokens on separate lines:
1106
- `,
1107
- tokensText,
1108
- `
1109
- Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
1110
- `If any file is missing, fail with ERROR: missing-file <filename> and stop.
1111
- `,
1112
- `Then fetch all documentation required by the instructions before proceeding with your task.`
1113
- ].join(" ");
1114
- return `[[ ## mandatory_pre_read ## ]]
1115
-
1116
- ${instruction}
1117
-
1118
- `;
1272
+ return `* [${fileName}](${fileUri})`;
1273
+ });
1274
+ const sections = [];
1275
+ if (guidelineFiles.length > 0) {
1276
+ sections.push(`Read all guideline files:
1277
+ ${buildList(guidelineFiles).join("\n")}.`);
1278
+ }
1279
+ if (attachmentFiles.length > 0) {
1280
+ sections.push(`Read all attachment files:
1281
+ ${buildList(attachmentFiles).join("\n")}.`);
1282
+ }
1283
+ sections.push(
1284
+ "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
1285
+ "Then apply system_instructions on the user query below."
1286
+ );
1287
+ return sections.join("\n");
1119
1288
  }
1120
- function collectInstructionFiles(attachments) {
1289
+ function collectGuidelineFiles(attachments, guidelinePatterns) {
1121
1290
  if (!attachments || attachments.length === 0) {
1122
1291
  return [];
1123
1292
  }
1124
1293
  const unique = /* @__PURE__ */ new Map();
1125
1294
  for (const attachment of attachments) {
1126
- if (!isInstructionPath(attachment)) {
1127
- continue;
1295
+ const absolutePath = import_node_path3.default.resolve(attachment);
1296
+ const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
1297
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
1298
+ if (!unique.has(absolutePath)) {
1299
+ unique.set(absolutePath, absolutePath);
1300
+ }
1128
1301
  }
1302
+ }
1303
+ return Array.from(unique.values());
1304
+ }
1305
+ function collectAttachmentFiles(attachments) {
1306
+ if (!attachments || attachments.length === 0) {
1307
+ return [];
1308
+ }
1309
+ const unique = /* @__PURE__ */ new Map();
1310
+ for (const attachment of attachments) {
1129
1311
  const absolutePath = import_node_path3.default.resolve(attachment);
1130
1312
  if (!unique.has(absolutePath)) {
1131
1313
  unique.set(absolutePath, absolutePath);
@@ -1133,10 +1315,6 @@ function collectInstructionFiles(attachments) {
1133
1315
  }
1134
1316
  return Array.from(unique.values());
1135
1317
  }
1136
- function isInstructionPath(filePath) {
1137
- const normalized = filePath.split(import_node_path3.default.sep).join("/");
1138
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
1139
- }
1140
1318
  function pathToFileUri(filePath) {
1141
1319
  const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
1142
1320
  const normalizedPath = absolutePath.replace(/\\/g, "/");
@@ -1145,14 +1323,6 @@ function pathToFileUri(filePath) {
1145
1323
  }
1146
1324
  return `file://${normalizedPath}`;
1147
1325
  }
1148
- function composeUserQuery(request) {
1149
- const segments = [];
1150
- segments.push(request.prompt.trim());
1151
- if (request.guidelines && request.guidelines.trim().length > 0) {
1152
- segments.push("\nGuidelines:\n", request.guidelines.trim());
1153
- }
1154
- return segments.join("\n").trim();
1155
- }
1156
1326
  function normalizeAttachments(attachments) {
1157
1327
  if (!attachments || attachments.length === 0) {
1158
1328
  return void 0;
@@ -1163,6 +1333,16 @@ function normalizeAttachments(attachments) {
1163
1333
  }
1164
1334
  return Array.from(deduped);
1165
1335
  }
1336
/**
 * Flatten several attachment lists into one list of absolute paths.
 *
 * Falsy lists are ignored, every entry is passed through `path.resolve`, and
 * repeated paths are kept once in first-seen order.
 *
 * @param {Iterable<Iterable<string> | undefined>} all - Attachment lists to merge.
 * @returns {string[] | undefined} The merged paths, or `undefined` when there are none.
 */
function mergeAttachments(all) {
  const uniquePaths = /* @__PURE__ */ new Set();
  for (const group of all) {
    for (const entry of group || []) {
      uniquePaths.add(import_node_path3.default.resolve(entry));
    }
  }
  if (uniquePaths.size === 0) {
    return void 0;
  }
  return [...uniquePaths];
}
1166
1346
  async function ensureVSCodeSubagents(options) {
1167
1347
  const { kind, count, verbose = false } = options;
1168
1348
  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
@@ -1504,7 +1684,7 @@ var import_node_crypto = require("crypto");
1504
1684
  var HeuristicGrader = class {
1505
1685
  kind = "heuristic";
1506
1686
  grade(context) {
1507
- const expectedAspects = extractAspects(context.testCase.expected_assistant_raw);
1687
+ const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1508
1688
  const result = scoreCandidateResponse(context.candidate, expectedAspects);
1509
1689
  const misses = [...result.misses];
1510
1690
  if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
@@ -1537,14 +1717,14 @@ var QualityGrader = class {
1537
1717
  if (!judgeProvider) {
1538
1718
  throw new Error("No judge provider available for LLM grading");
1539
1719
  }
1540
- const prompt = buildQualityPrompt(context.testCase, context.candidate);
1720
+ const prompt = buildQualityPrompt(context.evalCase, context.candidate);
1541
1721
  const metadata = {
1542
1722
  systemPrompt: QUALITY_SYSTEM_PROMPT
1543
1723
  };
1544
1724
  const response = await judgeProvider.invoke({
1545
1725
  prompt,
1546
1726
  metadata,
1547
- testCaseId: context.testCase.id,
1727
+ evalCaseId: context.evalCase.id,
1548
1728
  attempt: context.attempt,
1549
1729
  maxOutputTokens: this.maxOutputTokens,
1550
1730
  temperature: this.temperature
@@ -1590,16 +1770,16 @@ var QUALITY_SYSTEM_PROMPT = [
1590
1770
  function buildQualityPrompt(testCase, candidate) {
1591
1771
  const parts = [
1592
1772
  "[[ ## expected_outcome ## ]]",
1593
- testCase.outcome,
1773
+ testCase.outcome.trim(),
1594
1774
  "",
1595
1775
  "[[ ## request ## ]]",
1596
- testCase.task,
1776
+ testCase.task.trim(),
1597
1777
  "",
1598
1778
  "[[ ## reference_answer ## ]]",
1599
- testCase.expected_assistant_raw,
1779
+ testCase.expected_assistant_raw.trim(),
1600
1780
  "",
1601
1781
  "[[ ## generated_answer ## ]]",
1602
- candidate,
1782
+ candidate.trim(),
1603
1783
  "",
1604
1784
  "Respond with a single JSON object matching the schema described in the system prompt."
1605
1785
  ];
@@ -1848,10 +2028,10 @@ async function runEvaluation(options) {
1848
2028
  onResult,
1849
2029
  onProgress
1850
2030
  } = options;
1851
- const load = loadTestCases;
1852
- const testCases = await load(testFilePath, repoRoot, { verbose });
1853
- const filteredTestCases = filterTestCases(testCases, evalId);
1854
- if (filteredTestCases.length === 0) {
2031
+ const load = loadEvalCases;
2032
+ const evalCases = await load(testFilePath, repoRoot, { verbose });
2033
+ const filteredEvalCases = filterEvalCases(evalCases, evalId);
2034
+ if (filteredEvalCases.length === 0) {
1855
2035
  if (evalId) {
1856
2036
  throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
1857
2037
  }
@@ -1897,35 +2077,62 @@ async function runEvaluation(options) {
1897
2077
  };
1898
2078
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
1899
2079
  const primaryProvider = getOrCreateProvider(target);
1900
- if (onProgress && filteredTestCases.length > 0) {
1901
- for (let i = 0; i < filteredTestCases.length; i++) {
2080
+ const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
2081
+ if (target.providerBatching && !providerSupportsBatch && verbose) {
2082
+ console.warn(
2083
+ `Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
2084
+ );
2085
+ }
2086
+ if (onProgress && filteredEvalCases.length > 0) {
2087
+ for (let i = 0; i < filteredEvalCases.length; i++) {
1902
2088
  await onProgress({
1903
2089
  workerId: i + 1,
1904
- evalId: filteredTestCases[i].id,
2090
+ evalId: filteredEvalCases[i].id,
1905
2091
  status: "pending"
1906
2092
  });
1907
2093
  }
1908
2094
  }
2095
+ if (providerSupportsBatch) {
2096
+ try {
2097
+ return await runBatchEvaluation({
2098
+ evalCases: filteredEvalCases,
2099
+ provider: primaryProvider,
2100
+ target,
2101
+ graderRegistry,
2102
+ promptDumpDir,
2103
+ nowFn: now ?? (() => /* @__PURE__ */ new Date()),
2104
+ onProgress,
2105
+ onResult,
2106
+ verbose,
2107
+ resolveJudgeProvider
2108
+ });
2109
+ } catch (error) {
2110
+ if (verbose) {
2111
+ const message = error instanceof Error ? error.message : String(error);
2112
+ console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
2113
+ }
2114
+ }
2115
+ }
1909
2116
  const workers = options.maxConcurrency ?? target.workers ?? 1;
1910
2117
  const limit = pLimit(workers);
1911
2118
  let nextWorkerId = 1;
1912
2119
  const workerIdByEvalId = /* @__PURE__ */ new Map();
1913
- const promises = filteredTestCases.map(
1914
- (testCase) => limit(async () => {
2120
+ const promises = filteredEvalCases.map(
2121
+ (evalCase) => limit(async () => {
1915
2122
  const workerId = nextWorkerId++;
1916
- workerIdByEvalId.set(testCase.id, workerId);
2123
+ workerIdByEvalId.set(evalCase.id, workerId);
1917
2124
  if (onProgress) {
1918
2125
  await onProgress({
1919
2126
  workerId,
1920
- evalId: testCase.id,
2127
+ evalId: evalCase.id,
1921
2128
  status: "running",
1922
2129
  startedAt: Date.now()
1923
2130
  });
1924
2131
  }
1925
2132
  try {
1926
2133
  const judgeProvider = await resolveJudgeProvider(target);
1927
- const result = await runTestCase({
1928
- testCase,
2134
+ const result = await runEvalCase({
2135
+ evalCase,
1929
2136
  provider: primaryProvider,
1930
2137
  target,
1931
2138
  graders: graderRegistry,
@@ -1940,7 +2147,7 @@ async function runEvaluation(options) {
1940
2147
  if (onProgress) {
1941
2148
  await onProgress({
1942
2149
  workerId,
1943
- evalId: testCase.id,
2150
+ evalId: evalCase.id,
1944
2151
  status: "completed",
1945
2152
  startedAt: 0,
1946
2153
  // Not used for completed status
@@ -1955,7 +2162,7 @@ async function runEvaluation(options) {
1955
2162
  if (onProgress) {
1956
2163
  await onProgress({
1957
2164
  workerId,
1958
- evalId: testCase.id,
2165
+ evalId: evalCase.id,
1959
2166
  status: "failed",
1960
2167
  completedAt: Date.now(),
1961
2168
  error: error instanceof Error ? error.message : String(error)
@@ -1972,10 +2179,10 @@ async function runEvaluation(options) {
1972
2179
  if (outcome.status === "fulfilled") {
1973
2180
  results.push(outcome.value);
1974
2181
  } else {
1975
- const testCase = filteredTestCases[i];
1976
- const promptInputs = await buildPromptInputs(testCase);
2182
+ const evalCase = filteredEvalCases[i];
2183
+ const promptInputs = await buildPromptInputs(evalCase);
1977
2184
  const errorResult = buildErrorResult(
1978
- testCase,
2185
+ evalCase,
1979
2186
  target.name,
1980
2187
  (now ?? (() => /* @__PURE__ */ new Date()))(),
1981
2188
  outcome.reason,
@@ -1989,9 +2196,140 @@ async function runEvaluation(options) {
1989
2196
  }
1990
2197
  return results;
1991
2198
  }
1992
- async function runTestCase(options) {
2199
+ async function runBatchEvaluation(options) {
2200
+ const {
2201
+ evalCases,
2202
+ provider,
2203
+ target,
2204
+ graderRegistry,
2205
+ promptDumpDir,
2206
+ nowFn,
2207
+ onProgress,
2208
+ onResult,
2209
+ resolveJudgeProvider
2210
+ } = options;
2211
+ const promptInputsList = [];
2212
+ for (const evalCase of evalCases) {
2213
+ const promptInputs = await buildPromptInputs(evalCase);
2214
+ if (promptDumpDir) {
2215
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
2216
+ }
2217
+ promptInputsList.push(promptInputs);
2218
+ }
2219
+ const batchRequests = evalCases.map((evalCase, index) => {
2220
+ const promptInputs = promptInputsList[index];
2221
+ return {
2222
+ prompt: promptInputs.request,
2223
+ guidelines: promptInputs.guidelines,
2224
+ guideline_patterns: evalCase.guideline_patterns,
2225
+ attachments: evalCase.file_paths,
2226
+ evalCaseId: evalCase.id,
2227
+ metadata: {
2228
+ systemPrompt: promptInputs.systemMessage ?? ""
2229
+ }
2230
+ };
2231
+ });
2232
+ const batchResponse = await provider.invokeBatch?.(batchRequests);
2233
+ if (!Array.isArray(batchResponse)) {
2234
+ throw new Error("Provider batching failed: invokeBatch did not return an array");
2235
+ }
2236
+ if (batchResponse.length !== evalCases.length) {
2237
+ throw new Error(
2238
+ `Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
2239
+ );
2240
+ }
2241
+ if (onProgress) {
2242
+ const startedAt = Date.now();
2243
+ for (let i = 0; i < evalCases.length; i++) {
2244
+ await onProgress({
2245
+ workerId: 1,
2246
+ evalId: evalCases[i].id,
2247
+ status: "running",
2248
+ startedAt
2249
+ });
2250
+ }
2251
+ }
2252
+ const results = [];
2253
+ for (let i = 0; i < evalCases.length; i++) {
2254
+ const evalCase = evalCases[i];
2255
+ const promptInputs = promptInputsList[i];
2256
+ const providerResponse = batchResponse[i];
2257
+ const now = nowFn();
2258
+ const graderKind = evalCase.grader ?? "heuristic";
2259
+ const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
2260
+ if (!activeGrader) {
2261
+ throw new Error(`No grader registered for kind '${graderKind}'`);
2262
+ }
2263
+ let grade;
2264
+ try {
2265
+ grade = await activeGrader.grade({
2266
+ evalCase,
2267
+ candidate: providerResponse.text ?? "",
2268
+ target,
2269
+ provider,
2270
+ attempt: 0,
2271
+ promptInputs,
2272
+ now,
2273
+ judgeProvider: await resolveJudgeProvider(target)
2274
+ });
2275
+ } catch (error) {
2276
+ const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2277
+ results.push(errorResult);
2278
+ if (onResult) {
2279
+ await onResult(errorResult);
2280
+ }
2281
+ if (onProgress) {
2282
+ await onProgress({
2283
+ workerId: 1,
2284
+ evalId: evalCase.id,
2285
+ status: "failed",
2286
+ completedAt: Date.now(),
2287
+ error: error instanceof Error ? error.message : String(error)
2288
+ });
2289
+ }
2290
+ continue;
2291
+ }
2292
+ const completedAt = nowFn();
2293
+ const rawRequest = {
2294
+ request: promptInputs.request,
2295
+ guidelines: promptInputs.guidelines,
2296
+ guideline_paths: evalCase.guideline_paths,
2297
+ system_message: promptInputs.systemMessage ?? ""
2298
+ };
2299
+ const result = {
2300
+ eval_id: evalCase.id,
2301
+ conversation_id: evalCase.conversation_id,
2302
+ score: grade.score,
2303
+ hits: grade.hits,
2304
+ misses: grade.misses,
2305
+ model_answer: providerResponse.text ?? "",
2306
+ expected_aspect_count: grade.expectedAspectCount,
2307
+ target: target.name,
2308
+ timestamp: completedAt.toISOString(),
2309
+ reasoning: grade.reasoning,
2310
+ raw_aspects: grade.rawAspects,
2311
+ raw_request: rawRequest,
2312
+ grader_raw_request: grade.graderRawRequest
2313
+ };
2314
+ results.push(result);
2315
+ if (onResult) {
2316
+ await onResult(result);
2317
+ }
2318
+ if (onProgress) {
2319
+ await onProgress({
2320
+ workerId: 1,
2321
+ evalId: evalCase.id,
2322
+ status: "completed",
2323
+ startedAt: 0,
2324
+ completedAt: Date.now()
2325
+ });
2326
+ }
2327
+ }
2328
+ return results;
2329
+ }
2330
+ async function runEvalCase(options) {
1993
2331
  const {
1994
- testCase,
2332
+ evalCase,
1995
2333
  provider,
1996
2334
  target,
1997
2335
  graders,
@@ -2004,11 +2342,11 @@ async function runTestCase(options) {
2004
2342
  signal,
2005
2343
  judgeProvider
2006
2344
  } = options;
2007
- const promptInputs = await buildPromptInputs(testCase);
2345
+ const promptInputs = await buildPromptInputs(evalCase);
2008
2346
  if (promptDumpDir) {
2009
- await dumpPrompt(promptDumpDir, testCase, promptInputs);
2347
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
2010
2348
  }
2011
- const cacheKey = useCache ? createCacheKey(provider, target, testCase, promptInputs) : void 0;
2349
+ const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
2012
2350
  let cachedResponse;
2013
2351
  if (cacheKey && cache) {
2014
2352
  cachedResponse = await cache.get(cacheKey);
@@ -2021,7 +2359,7 @@ async function runTestCase(options) {
2021
2359
  while (!providerResponse && attempt < attemptBudget) {
2022
2360
  try {
2023
2361
  providerResponse = await invokeProvider(provider, {
2024
- testCase,
2362
+ evalCase,
2025
2363
  target,
2026
2364
  promptInputs,
2027
2365
  attempt,
@@ -2034,12 +2372,12 @@ async function runTestCase(options) {
2034
2372
  attempt += 1;
2035
2373
  continue;
2036
2374
  }
2037
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
2375
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2038
2376
  }
2039
2377
  }
2040
2378
  if (!providerResponse) {
2041
2379
  return buildErrorResult(
2042
- testCase,
2380
+ evalCase,
2043
2381
  target.name,
2044
2382
  nowFn(),
2045
2383
  lastError ?? new Error("Provider did not return a response"),
@@ -2049,7 +2387,7 @@ async function runTestCase(options) {
2049
2387
  if (cacheKey && cache && !cachedResponse) {
2050
2388
  await cache.set(cacheKey, providerResponse);
2051
2389
  }
2052
- const graderKind = testCase.grader ?? "heuristic";
2390
+ const graderKind = evalCase.grader ?? "heuristic";
2053
2391
  const activeGrader = graders[graderKind] ?? graders.heuristic;
2054
2392
  if (!activeGrader) {
2055
2393
  throw new Error(`No grader registered for kind '${graderKind}'`);
@@ -2058,7 +2396,7 @@ async function runTestCase(options) {
2058
2396
  try {
2059
2397
  const gradeTimestamp = nowFn();
2060
2398
  grade = await activeGrader.grade({
2061
- testCase,
2399
+ evalCase,
2062
2400
  candidate: providerResponse.text ?? "",
2063
2401
  target,
2064
2402
  provider,
@@ -2068,17 +2406,18 @@ async function runTestCase(options) {
2068
2406
  judgeProvider
2069
2407
  });
2070
2408
  } catch (error) {
2071
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
2409
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2072
2410
  }
2073
2411
  const completedAt = nowFn();
2074
2412
  const rawRequest = {
2075
2413
  request: promptInputs.request,
2076
2414
  guidelines: promptInputs.guidelines,
2077
- guideline_paths: testCase.guideline_paths
2415
+ guideline_paths: evalCase.guideline_paths,
2416
+ system_message: promptInputs.systemMessage ?? ""
2078
2417
  };
2079
2418
  return {
2080
- eval_id: testCase.id,
2081
- conversation_id: testCase.conversation_id,
2419
+ eval_id: evalCase.id,
2420
+ conversation_id: evalCase.conversation_id,
2082
2421
  score: grade.score,
2083
2422
  hits: grade.hits,
2084
2423
  misses: grade.misses,
@@ -2092,11 +2431,11 @@ async function runTestCase(options) {
2092
2431
  grader_raw_request: grade.graderRawRequest
2093
2432
  };
2094
2433
  }
2095
- function filterTestCases(testCases, evalId) {
2434
+ function filterEvalCases(evalCases, evalId) {
2096
2435
  if (!evalId) {
2097
- return testCases;
2436
+ return evalCases;
2098
2437
  }
2099
- return testCases.filter((testCase) => testCase.id === evalId);
2438
+ return evalCases.filter((evalCase) => evalCase.id === evalId);
2100
2439
  }
2101
2440
  function buildGraderRegistry(overrides, resolveJudgeProvider) {
2102
2441
  const heuristic = overrides?.heuristic ?? new HeuristicGrader();
@@ -2114,16 +2453,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
2114
2453
  llm_judge: llmJudge
2115
2454
  };
2116
2455
  }
2117
- async function dumpPrompt(directory, testCase, promptInputs) {
2456
+ async function dumpPrompt(directory, evalCase, promptInputs) {
2118
2457
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
2119
- const filename = `${timestamp}_${sanitizeFilename(testCase.id)}.json`;
2458
+ const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
2120
2459
  const filePath = import_node_path5.default.resolve(directory, filename);
2121
2460
  await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
2122
2461
  const payload = {
2123
- eval_id: testCase.id,
2462
+ eval_id: evalCase.id,
2124
2463
  request: promptInputs.request,
2125
2464
  guidelines: promptInputs.guidelines,
2126
- guideline_paths: testCase.guideline_paths
2465
+ guideline_paths: evalCase.guideline_paths
2127
2466
  };
2128
2467
  await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
2129
2468
  }
@@ -2135,7 +2474,7 @@ function sanitizeFilename(value) {
2135
2474
  return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
2136
2475
  }
2137
2476
  async function invokeProvider(provider, options) {
2138
- const { testCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2477
+ const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2139
2478
  const controller = new AbortController();
2140
2479
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2141
2480
  if (signal) {
@@ -2145,12 +2484,12 @@ async function invokeProvider(provider, options) {
2145
2484
  return await provider.invoke({
2146
2485
  prompt: promptInputs.request,
2147
2486
  guidelines: promptInputs.guidelines,
2148
- attachments: testCase.guideline_paths,
2149
- testCaseId: testCase.id,
2487
+ guideline_patterns: evalCase.guideline_patterns,
2488
+ attachments: evalCase.file_paths,
2489
+ evalCaseId: evalCase.id,
2150
2490
  attempt,
2151
2491
  metadata: {
2152
- target: target.name,
2153
- grader: testCase.grader
2492
+ systemPrompt: promptInputs.systemMessage ?? ""
2154
2493
  },
2155
2494
  signal: controller.signal
2156
2495
  });
@@ -2160,17 +2499,18 @@ async function invokeProvider(provider, options) {
2160
2499
  }
2161
2500
  }
2162
2501
  }
2163
- function buildErrorResult(testCase, targetName, timestamp, error, promptInputs) {
2502
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
2164
2503
  const message = error instanceof Error ? error.message : String(error);
2165
2504
  const rawRequest = {
2166
2505
  request: promptInputs.request,
2167
2506
  guidelines: promptInputs.guidelines,
2168
- guideline_paths: testCase.guideline_paths,
2507
+ guideline_paths: evalCase.guideline_paths,
2508
+ system_message: promptInputs.systemMessage ?? "",
2169
2509
  error: message
2170
2510
  };
2171
2511
  return {
2172
- eval_id: testCase.id,
2173
- conversation_id: testCase.conversation_id,
2512
+ eval_id: evalCase.id,
2513
+ conversation_id: evalCase.conversation_id,
2174
2514
  score: 0,
2175
2515
  hits: [],
2176
2516
  misses: [`Error: ${message}`],
@@ -2182,13 +2522,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
2182
2522
  raw_request: rawRequest
2183
2523
  };
2184
2524
  }
2185
- function createCacheKey(provider, target, testCase, promptInputs) {
2525
+ function createCacheKey(provider, target, evalCase, promptInputs) {
2186
2526
  const hash = (0, import_node_crypto2.createHash)("sha256");
2187
2527
  hash.update(provider.id);
2188
2528
  hash.update(target.name);
2189
- hash.update(testCase.id);
2529
+ hash.update(evalCase.id);
2190
2530
  hash.update(promptInputs.request);
2191
2531
  hash.update(promptInputs.guidelines);
2532
+ hash.update(promptInputs.systemMessage ?? "");
2192
2533
  return hash.digest("hex");
2193
2534
  }
2194
2535
  function isTimeoutLike(error) {
@@ -2217,7 +2558,9 @@ function createAgentKernel() {
2217
2558
  HeuristicGrader,
2218
2559
  QualityGrader,
2219
2560
  TEST_MESSAGE_ROLES,
2561
+ buildDirectoryChain,
2220
2562
  buildPromptInputs,
2563
+ buildSearchRoots,
2221
2564
  calculateHits,
2222
2565
  calculateMisses,
2223
2566
  createAgentKernel,
@@ -2225,6 +2568,8 @@ function createAgentKernel() {
2225
2568
  ensureVSCodeSubagents,
2226
2569
  extractAspects,
2227
2570
  extractCodeBlocks,
2571
+ fileExists,
2572
+ findGitRoot,
2228
2573
  getHitCount,
2229
2574
  isErrorLike,
2230
2575
  isGraderKind,
@@ -2234,12 +2579,13 @@ function createAgentKernel() {
2234
2579
  isTestMessage,
2235
2580
  isTestMessageRole,
2236
2581
  listTargetNames,
2237
- loadTestCases,
2582
+ loadEvalCases,
2238
2583
  readTargetDefinitions,
2239
2584
  resolveAndCreateProvider,
2585
+ resolveFileReference,
2240
2586
  resolveTargetDefinition,
2587
+ runEvalCase,
2241
2588
  runEvaluation,
2242
- runTestCase,
2243
2589
  scoreCandidateResponse
2244
2590
  });
2245
2591
  //# sourceMappingURL=index.cjs.map