@agentv/core 0.2.6 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,8 +1,11 @@
1
1
  import {
2
2
  TARGETS_SCHEMA_V2,
3
+ buildDirectoryChain,
3
4
  buildSearchRoots,
5
+ fileExists,
6
+ findGitRoot,
4
7
  resolveFileReference
5
- } from "./chunk-QVS4OL44.js";
8
+ } from "./chunk-P4GOYWYH.js";
6
9
 
7
10
  // src/evaluation/types.ts
8
11
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -56,6 +59,7 @@ function getHitCount(result) {
56
59
  }
57
60
 
58
61
  // src/evaluation/yaml-parser.ts
62
+ import micromatch from "micromatch";
59
63
  import { constants } from "node:fs";
60
64
  import { access, readFile } from "node:fs/promises";
61
65
  import path from "node:path";
@@ -65,9 +69,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
65
69
  var ANSI_YELLOW = "\x1B[33m";
66
70
  var ANSI_RESET = "\x1B[0m";
67
71
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
68
- function isGuidelineFile(filePath) {
72
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
73
+ async function loadConfig(evalFilePath, repoRoot) {
74
+ const directories = buildDirectoryChain(evalFilePath, repoRoot);
75
+ for (const directory of directories) {
76
+ const configPath = path.join(directory, ".agentv", "config.yaml");
77
+ if (!await fileExists2(configPath)) {
78
+ continue;
79
+ }
80
+ try {
81
+ const rawConfig = await readFile(configPath, "utf8");
82
+ const parsed = parse(rawConfig);
83
+ if (!isJsonObject(parsed)) {
84
+ logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
85
+ continue;
86
+ }
87
+ const config = parsed;
88
+ const schema = config.$schema;
89
+ if (schema !== SCHEMA_CONFIG_V2) {
90
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
91
+ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
92
+ logWarning(message);
93
+ continue;
94
+ }
95
+ const guidelinePatterns = config.guideline_patterns;
96
+ if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
97
+ logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
98
+ continue;
99
+ }
100
+ if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
101
+ logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
102
+ continue;
103
+ }
104
+ return {
105
+ guideline_patterns: guidelinePatterns
106
+ };
107
+ } catch (error) {
108
+ logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
109
+ continue;
110
+ }
111
+ }
112
+ return null;
113
+ }
114
+ function isGuidelineFile(filePath, patterns) {
69
115
  const normalized = filePath.split("\\").join("/");
70
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
116
+ const patternsToUse = patterns ?? [];
117
+ return micromatch.isMatch(normalized, patternsToUse);
71
118
  }
72
119
  function extractCodeBlocks(segments) {
73
120
  const codeBlocks = [];
@@ -87,43 +134,45 @@ function extractCodeBlocks(segments) {
87
134
  }
88
135
  return codeBlocks;
89
136
  }
90
- async function loadTestCases(testFilePath, repoRoot, options) {
137
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
91
138
  const verbose = options?.verbose ?? false;
92
- const absoluteTestPath = path.resolve(testFilePath);
93
- if (!await fileExists(absoluteTestPath)) {
94
- throw new Error(`Test file not found: ${testFilePath}`);
139
+ const absoluteTestPath = path.resolve(evalFilePath);
140
+ if (!await fileExists2(absoluteTestPath)) {
141
+ throw new Error(`Test file not found: ${evalFilePath}`);
95
142
  }
96
143
  const repoRootPath = resolveToAbsolutePath(repoRoot);
97
144
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
145
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
146
+ const guidelinePatterns = config?.guideline_patterns;
98
147
  const rawFile = await readFile(absoluteTestPath, "utf8");
99
148
  const parsed = parse(rawFile);
100
149
  if (!isJsonObject(parsed)) {
101
- throw new Error(`Invalid test file format: ${testFilePath}`);
150
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
102
151
  }
103
152
  const suite = parsed;
104
153
  const schema = suite.$schema;
105
154
  if (schema !== SCHEMA_EVAL_V2) {
106
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${testFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${testFilePath}.
155
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
107
156
  Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
108
157
  throw new Error(message);
109
158
  }
110
159
  const rawTestcases = suite.evalcases;
111
160
  if (!Array.isArray(rawTestcases)) {
112
- throw new Error(`Invalid test file format: ${testFilePath} - missing 'evalcases' field`);
161
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
113
162
  }
114
163
  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
115
164
  const results = [];
116
- for (const rawTestcase of rawTestcases) {
117
- if (!isJsonObject(rawTestcase)) {
165
+ for (const rawEvalcase of rawTestcases) {
166
+ if (!isJsonObject(rawEvalcase)) {
118
167
  logWarning("Skipping invalid test case entry (expected object)");
119
168
  continue;
120
169
  }
121
- const testcase = rawTestcase;
122
- const id = asString(testcase.id);
123
- const conversationId = asString(testcase.conversation_id);
124
- const outcome = asString(testcase.outcome);
125
- const inputMessagesValue = testcase.input_messages;
126
- const expectedMessagesValue = testcase.expected_messages;
170
+ const evalcase = rawEvalcase;
171
+ const id = asString(evalcase.id);
172
+ const conversationId = asString(evalcase.conversation_id);
173
+ const outcome = asString(evalcase.outcome);
174
+ const inputMessagesValue = evalcase.input_messages;
175
+ const expectedMessagesValue = evalcase.expected_messages;
127
176
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
128
177
  logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
129
178
  continue;
@@ -136,6 +185,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
136
185
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
137
186
  const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
138
187
  const userMessages = inputMessages.filter((message) => message.role === "user");
188
+ const systemMessages = inputMessages.filter((message) => message.role === "system");
139
189
  if (assistantMessages.length === 0) {
140
190
  logWarning(`No assistant message found for test case: ${id}`);
141
191
  continue;
@@ -143,6 +193,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
143
193
  if (assistantMessages.length > 1) {
144
194
  logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
145
195
  }
196
+ if (systemMessages.length > 1) {
197
+ logWarning(`Multiple system messages found for test case: ${id}, using first`);
198
+ }
199
+ let systemMessageContent;
200
+ if (systemMessages.length > 0) {
201
+ const content = systemMessages[0]?.content;
202
+ if (typeof content === "string") {
203
+ systemMessageContent = content;
204
+ } else if (Array.isArray(content)) {
205
+ const textParts = [];
206
+ for (const segment of content) {
207
+ if (isJsonObject(segment)) {
208
+ const value = segment.value;
209
+ if (typeof value === "string") {
210
+ textParts.push(value);
211
+ }
212
+ }
213
+ }
214
+ if (textParts.length > 0) {
215
+ systemMessageContent = textParts.join("\n\n");
216
+ }
217
+ }
218
+ }
146
219
  const userSegments = [];
147
220
  const guidelinePaths = [];
148
221
  const userTextParts = [];
@@ -174,7 +247,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
174
247
  }
175
248
  try {
176
249
  const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
177
- if (isGuidelineFile(displayPath)) {
250
+ const relativeToRepo = path.relative(repoRootPath, resolvedPath);
251
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
178
252
  guidelinePaths.push(path.resolve(resolvedPath));
179
253
  if (verbose) {
180
254
  console.log(` [Guideline] Found: ${displayPath}`);
@@ -184,7 +258,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
184
258
  userSegments.push({
185
259
  type: "file",
186
260
  path: displayPath,
187
- text: fileContent
261
+ text: fileContent,
262
+ resolvedPath: path.resolve(resolvedPath)
188
263
  });
189
264
  if (verbose) {
190
265
  console.log(` [File] Found: ${displayPath}`);
@@ -208,14 +283,27 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
208
283
  const assistantContent = assistantMessages[0]?.content;
209
284
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
210
285
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
211
- const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
286
+ const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
287
+ const userFilePaths = [];
288
+ for (const segment of userSegments) {
289
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
290
+ userFilePaths.push(segment.resolvedPath);
291
+ }
292
+ }
293
+ const allFilePaths = [
294
+ ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
295
+ ...userFilePaths
296
+ ];
212
297
  const testCase = {
213
298
  id,
214
299
  conversation_id: conversationId,
215
300
  task: userTextPrompt,
216
301
  user_segments: userSegments,
302
+ system_message: systemMessageContent,
217
303
  expected_assistant_raw: expectedAssistantRaw,
218
304
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
305
+ guideline_patterns: guidelinePatterns,
306
+ file_paths: allFilePaths,
219
307
  code_snippets: codeSnippets,
220
308
  outcome,
221
309
  grader: testCaseGrader
@@ -240,7 +328,7 @@ async function buildPromptInputs(testCase) {
240
328
  const guidelineContents = [];
241
329
  for (const rawPath of testCase.guideline_paths) {
242
330
  const absolutePath = path.resolve(rawPath);
243
- if (!await fileExists(absolutePath)) {
331
+ if (!await fileExists2(absolutePath)) {
244
332
  logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
245
333
  continue;
246
334
  }
@@ -281,9 +369,9 @@ ${body}`);
281
369
  }
282
370
  const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
283
371
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
284
- return { request, guidelines };
372
+ return { request, guidelines, systemMessage: testCase.system_message };
285
373
  }
286
- async function fileExists(absolutePath) {
374
+ async function fileExists2(absolutePath) {
287
375
  try {
288
376
  await access(absolutePath, constants.F_OK);
289
377
  return true;
@@ -407,15 +495,18 @@ function buildChatPrompt(request) {
407
495
  return request.chatPrompt;
408
496
  }
409
497
  const systemSegments = [];
410
- if (request.guidelines && request.guidelines.trim().length > 0) {
411
- systemSegments.push(`Guidelines:
412
- ${request.guidelines.trim()}`);
413
- }
414
498
  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
415
499
  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
416
500
  systemSegments.push(metadataSystemPrompt.trim());
501
+ } else {
502
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
417
503
  }
418
- const systemContent = systemSegments.length > 0 ? systemSegments.join("\n\n") : DEFAULT_SYSTEM_PROMPT;
504
+ if (request.guidelines && request.guidelines.trim().length > 0) {
505
+ systemSegments.push(`[[ ## Guidelines ## ]]
506
+
507
+ ${request.guidelines.trim()}`);
508
+ }
509
+ const systemContent = systemSegments.join("\n\n");
419
510
  const userContent = request.prompt.trim();
420
511
  const prompt = [
421
512
  {
@@ -644,6 +735,9 @@ function normalizeAzureApiVersion(value) {
644
735
  function resolveTargetDefinition(definition, env = process.env) {
645
736
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
646
737
  const provider = parsed.provider.toLowerCase();
738
+ const providerBatching = resolveOptionalBoolean(
739
+ parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
740
+ );
647
741
  switch (provider) {
648
742
  case "azure":
649
743
  case "azure-openai":
@@ -652,6 +746,7 @@ function resolveTargetDefinition(definition, env = process.env) {
652
746
  name: parsed.name,
653
747
  judgeTarget: parsed.judge_target,
654
748
  workers: parsed.workers,
749
+ providerBatching,
655
750
  config: resolveAzureConfig(parsed, env)
656
751
  };
657
752
  case "anthropic":
@@ -660,6 +755,7 @@ function resolveTargetDefinition(definition, env = process.env) {
660
755
  name: parsed.name,
661
756
  judgeTarget: parsed.judge_target,
662
757
  workers: parsed.workers,
758
+ providerBatching,
663
759
  config: resolveAnthropicConfig(parsed, env)
664
760
  };
665
761
  case "gemini":
@@ -670,6 +766,7 @@ function resolveTargetDefinition(definition, env = process.env) {
670
766
  name: parsed.name,
671
767
  judgeTarget: parsed.judge_target,
672
768
  workers: parsed.workers,
769
+ providerBatching,
673
770
  config: resolveGeminiConfig(parsed, env)
674
771
  };
675
772
  case "mock":
@@ -678,6 +775,7 @@ function resolveTargetDefinition(definition, env = process.env) {
678
775
  name: parsed.name,
679
776
  judgeTarget: parsed.judge_target,
680
777
  workers: parsed.workers,
778
+ providerBatching,
681
779
  config: resolveMockConfig(parsed)
682
780
  };
683
781
  case "vscode":
@@ -687,6 +785,7 @@ function resolveTargetDefinition(definition, env = process.env) {
687
785
  name: parsed.name,
688
786
  judgeTarget: parsed.judge_target,
689
787
  workers: parsed.workers,
788
+ providerBatching,
690
789
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
691
790
  };
692
791
  default:
@@ -871,15 +970,19 @@ function isLikelyEnvReference(value) {
871
970
  }
872
971
 
873
972
  // src/evaluation/providers/vscode.ts
874
- import { mkdtemp, readFile as readFile2, rm, writeFile } from "node:fs/promises";
875
- import { tmpdir } from "node:os";
973
+ import { readFile as readFile2 } from "node:fs/promises";
876
974
  import path2 from "node:path";
877
- import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
878
- var PROMPT_FILE_PREFIX = "agentv-vscode-";
975
+ import {
976
+ dispatchAgentSession,
977
+ dispatchBatchAgent,
978
+ getSubagentRoot,
979
+ provisionSubagents
980
+ } from "subagent";
879
981
  var VSCodeProvider = class {
880
982
  id;
881
983
  kind;
882
984
  targetName;
985
+ supportsBatch = true;
883
986
  config;
884
987
  constructor(targetName, config, kind) {
885
988
  this.id = `${kind}:${targetName}`;
@@ -892,117 +995,159 @@ var VSCodeProvider = class {
892
995
  throw new Error("VS Code provider request was aborted before dispatch");
893
996
  }
894
997
  const attachments = normalizeAttachments(request.attachments);
895
- const promptContent = buildPromptDocument(request, attachments);
896
- const directory = await mkdtemp(path2.join(tmpdir(), PROMPT_FILE_PREFIX));
897
- const promptPath = path2.join(directory, `${request.testCaseId ?? "request"}.prompt.md`);
898
- try {
899
- await writeFile(promptPath, promptContent, "utf8");
900
- const session = await dispatchAgentSession({
901
- userQuery: composeUserQuery(request),
902
- promptFile: promptPath,
903
- extraAttachments: attachments,
904
- wait: this.config.waitForResponse,
905
- dryRun: this.config.dryRun,
906
- vscodeCmd: this.config.command,
907
- subagentRoot: this.config.subagentRoot,
908
- workspaceTemplate: this.config.workspaceTemplate,
909
- silent: true
910
- });
911
- if (session.exitCode !== 0 || !session.responseFile) {
912
- const failure = session.error ?? "VS Code subagent did not produce a response";
913
- throw new Error(failure);
914
- }
915
- if (this.config.dryRun) {
916
- return {
917
- text: "",
918
- raw: {
919
- session,
920
- promptFile: promptPath,
921
- attachments
922
- }
923
- };
924
- }
925
- const responseText = await readFile2(session.responseFile, "utf8");
998
+ const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
999
+ const session = await dispatchAgentSession({
1000
+ userQuery: promptContent,
1001
+ // Use full prompt content instead of just request.prompt
1002
+ extraAttachments: attachments,
1003
+ wait: this.config.waitForResponse,
1004
+ dryRun: this.config.dryRun,
1005
+ vscodeCmd: this.config.command,
1006
+ subagentRoot: this.config.subagentRoot,
1007
+ workspaceTemplate: this.config.workspaceTemplate,
1008
+ silent: true
1009
+ });
1010
+ if (session.exitCode !== 0 || !session.responseFile) {
1011
+ const failure = session.error ?? "VS Code subagent did not produce a response";
1012
+ throw new Error(failure);
1013
+ }
1014
+ if (this.config.dryRun) {
926
1015
  return {
927
- text: responseText,
1016
+ text: "",
928
1017
  raw: {
929
1018
  session,
930
- promptFile: promptPath,
931
1019
  attachments
932
1020
  }
933
1021
  };
934
- } finally {
935
- await rm(directory, { recursive: true, force: true });
936
1022
  }
1023
+ const responseText = await readFile2(session.responseFile, "utf8");
1024
+ return {
1025
+ text: responseText,
1026
+ raw: {
1027
+ session,
1028
+ attachments
1029
+ }
1030
+ };
1031
+ }
1032
+ async invokeBatch(requests) {
1033
+ if (requests.length === 0) {
1034
+ return [];
1035
+ }
1036
+ const normalizedRequests = requests.map((req) => ({
1037
+ request: req,
1038
+ attachments: normalizeAttachments(req.attachments)
1039
+ }));
1040
+ const combinedAttachments = mergeAttachments(
1041
+ normalizedRequests.map(({ attachments }) => attachments)
1042
+ );
1043
+ const userQueries = normalizedRequests.map(
1044
+ ({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
1045
+ );
1046
+ const session = await dispatchBatchAgent({
1047
+ userQueries,
1048
+ extraAttachments: combinedAttachments,
1049
+ wait: this.config.waitForResponse,
1050
+ dryRun: this.config.dryRun,
1051
+ vscodeCmd: this.config.command,
1052
+ subagentRoot: this.config.subagentRoot,
1053
+ workspaceTemplate: this.config.workspaceTemplate,
1054
+ silent: true
1055
+ });
1056
+ if (session.exitCode !== 0 || !session.responseFiles) {
1057
+ const failure = session.error ?? "VS Code subagent did not produce batch responses";
1058
+ throw new Error(failure);
1059
+ }
1060
+ if (this.config.dryRun) {
1061
+ return normalizedRequests.map(({ attachments }) => ({
1062
+ text: "",
1063
+ raw: {
1064
+ session,
1065
+ attachments,
1066
+ allAttachments: combinedAttachments
1067
+ }
1068
+ }));
1069
+ }
1070
+ if (session.responseFiles.length !== requests.length) {
1071
+ throw new Error(
1072
+ `VS Code batch returned ${session.responseFiles.length} responses for ${requests.length} requests`
1073
+ );
1074
+ }
1075
+ const responses = [];
1076
+ for (const [index, responseFile] of session.responseFiles.entries()) {
1077
+ const responseText = await readFile2(responseFile, "utf8");
1078
+ responses.push({
1079
+ text: responseText,
1080
+ raw: {
1081
+ session,
1082
+ attachments: normalizedRequests[index]?.attachments,
1083
+ allAttachments: combinedAttachments,
1084
+ responseFile
1085
+ }
1086
+ });
1087
+ }
1088
+ return responses;
937
1089
  }
938
1090
  };
939
- function buildPromptDocument(request, attachments) {
1091
+ function buildPromptDocument(request, attachments, guidelinePatterns) {
940
1092
  const parts = [];
941
- const instructionFiles = collectInstructionFiles(attachments);
942
- if (instructionFiles.length > 0) {
943
- parts.push(buildMandatoryPrereadBlock(instructionFiles));
944
- }
945
- parts.push(`# AgentV Request`);
946
- if (request.testCaseId) {
947
- parts.push(`- Test Case: ${request.testCaseId}`);
948
- }
949
- if (request.metadata?.target) {
950
- parts.push(`- Target: ${String(request.metadata.target)}`);
951
- }
952
- parts.push("\n## Task\n", request.prompt.trim());
953
- if (request.guidelines && request.guidelines.trim().length > 0) {
954
- parts.push("\n## Guidelines\n", request.guidelines.trim());
955
- }
956
- if (attachments && attachments.length > 0) {
957
- const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
958
- parts.push("\n## Attachments\n", attachmentList);
1093
+ const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1094
+ const attachmentFiles = collectAttachmentFiles(attachments);
1095
+ const nonGuidelineAttachments = attachmentFiles.filter(
1096
+ (file) => !guidelineFiles.includes(file)
1097
+ );
1098
+ const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineAttachments);
1099
+ if (prereadBlock.length > 0) {
1100
+ parts.push("\n", prereadBlock);
959
1101
  }
1102
+ parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
960
1103
  return parts.join("\n").trim();
961
1104
  }
962
- function buildMandatoryPrereadBlock(instructionFiles) {
963
- if (instructionFiles.length === 0) {
1105
+ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
1106
+ if (guidelineFiles.length === 0 && attachmentFiles.length === 0) {
964
1107
  return "";
965
1108
  }
966
- const fileList = [];
967
- const tokenList = [];
968
- let counter = 0;
969
- for (const absolutePath of instructionFiles) {
970
- counter += 1;
1109
+ const buildList = (files) => files.map((absolutePath) => {
971
1110
  const fileName = path2.basename(absolutePath);
972
1111
  const fileUri = pathToFileUri(absolutePath);
973
- fileList.push(`[${fileName}](${fileUri})`);
974
- tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
975
- }
976
- const filesText = fileList.join(", ");
977
- const tokensText = tokenList.join("\n");
978
- const instruction = [
979
- `Read all instruction files: ${filesText}.`,
980
- `After reading each file, compute its SHA256 hash using this PowerShell command:`,
981
- "`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
982
- `Then include, at the top of your reply, these exact tokens on separate lines:
983
- `,
984
- tokensText,
985
- `
986
- Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
987
- `If any file is missing, fail with ERROR: missing-file <filename> and stop.
988
- `,
989
- `Then fetch all documentation required by the instructions before proceeding with your task.`
990
- ].join(" ");
991
- return `[[ ## mandatory_pre_read ## ]]
992
-
993
- ${instruction}
994
-
995
- `;
1112
+ return `* [${fileName}](${fileUri})`;
1113
+ });
1114
+ const sections = [];
1115
+ if (guidelineFiles.length > 0) {
1116
+ sections.push(`Read all guideline files:
1117
+ ${buildList(guidelineFiles).join("\n")}.`);
1118
+ }
1119
+ if (attachmentFiles.length > 0) {
1120
+ sections.push(`Read all attachment files:
1121
+ ${buildList(attachmentFiles).join("\n")}.`);
1122
+ }
1123
+ sections.push(
1124
+ "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
1125
+ "Then apply system_instructions on the user query below."
1126
+ );
1127
+ return sections.join("\n");
996
1128
  }
997
- function collectInstructionFiles(attachments) {
1129
+ function collectGuidelineFiles(attachments, guidelinePatterns) {
998
1130
  if (!attachments || attachments.length === 0) {
999
1131
  return [];
1000
1132
  }
1001
1133
  const unique = /* @__PURE__ */ new Map();
1002
1134
  for (const attachment of attachments) {
1003
- if (!isInstructionPath(attachment)) {
1004
- continue;
1135
+ const absolutePath = path2.resolve(attachment);
1136
+ const normalized = absolutePath.split(path2.sep).join("/");
1137
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
1138
+ if (!unique.has(absolutePath)) {
1139
+ unique.set(absolutePath, absolutePath);
1140
+ }
1005
1141
  }
1142
+ }
1143
+ return Array.from(unique.values());
1144
+ }
1145
+ function collectAttachmentFiles(attachments) {
1146
+ if (!attachments || attachments.length === 0) {
1147
+ return [];
1148
+ }
1149
+ const unique = /* @__PURE__ */ new Map();
1150
+ for (const attachment of attachments) {
1006
1151
  const absolutePath = path2.resolve(attachment);
1007
1152
  if (!unique.has(absolutePath)) {
1008
1153
  unique.set(absolutePath, absolutePath);
@@ -1010,10 +1155,6 @@ function collectInstructionFiles(attachments) {
1010
1155
  }
1011
1156
  return Array.from(unique.values());
1012
1157
  }
1013
- function isInstructionPath(filePath) {
1014
- const normalized = filePath.split(path2.sep).join("/");
1015
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
1016
- }
1017
1158
  function pathToFileUri(filePath) {
1018
1159
  const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
1019
1160
  const normalizedPath = absolutePath.replace(/\\/g, "/");
@@ -1022,14 +1163,6 @@ function pathToFileUri(filePath) {
1022
1163
  }
1023
1164
  return `file://${normalizedPath}`;
1024
1165
  }
1025
- function composeUserQuery(request) {
1026
- const segments = [];
1027
- segments.push(request.prompt.trim());
1028
- if (request.guidelines && request.guidelines.trim().length > 0) {
1029
- segments.push("\nGuidelines:\n", request.guidelines.trim());
1030
- }
1031
- return segments.join("\n").trim();
1032
- }
1033
1166
  function normalizeAttachments(attachments) {
1034
1167
  if (!attachments || attachments.length === 0) {
1035
1168
  return void 0;
@@ -1040,6 +1173,16 @@ function normalizeAttachments(attachments) {
1040
1173
  }
1041
1174
  return Array.from(deduped);
1042
1175
  }
1176
+ function mergeAttachments(all) {
1177
+ const deduped = /* @__PURE__ */ new Set();
1178
+ for (const list of all) {
1179
+ if (!list) continue;
1180
+ for (const attachment of list) {
1181
+ deduped.add(path2.resolve(attachment));
1182
+ }
1183
+ }
1184
+ return deduped.size > 0 ? Array.from(deduped) : void 0;
1185
+ }
1043
1186
  async function ensureVSCodeSubagents(options) {
1044
1187
  const { kind, count, verbose = false } = options;
1045
1188
  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
@@ -1136,7 +1279,7 @@ function assertTargetDefinition(value, index, filePath) {
1136
1279
  judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
1137
1280
  };
1138
1281
  }
1139
- async function fileExists2(filePath) {
1282
+ async function fileExists3(filePath) {
1140
1283
  try {
1141
1284
  await access2(filePath, constants2.F_OK);
1142
1285
  return true;
@@ -1146,7 +1289,7 @@ async function fileExists2(filePath) {
1146
1289
  }
1147
1290
  async function readTargetDefinitions(filePath) {
1148
1291
  const absolutePath = path3.resolve(filePath);
1149
- if (!await fileExists2(absolutePath)) {
1292
+ if (!await fileExists3(absolutePath)) {
1150
1293
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1151
1294
  }
1152
1295
  const raw = await readFile3(absolutePath, "utf8");
@@ -1376,7 +1519,7 @@ import { randomUUID } from "node:crypto";
1376
1519
  var HeuristicGrader = class {
1377
1520
  kind = "heuristic";
1378
1521
  grade(context) {
1379
- const expectedAspects = extractAspects(context.testCase.expected_assistant_raw);
1522
+ const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1380
1523
  const result = scoreCandidateResponse(context.candidate, expectedAspects);
1381
1524
  const misses = [...result.misses];
1382
1525
  if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
@@ -1409,14 +1552,14 @@ var QualityGrader = class {
1409
1552
  if (!judgeProvider) {
1410
1553
  throw new Error("No judge provider available for LLM grading");
1411
1554
  }
1412
- const prompt = buildQualityPrompt(context.testCase, context.candidate);
1555
+ const prompt = buildQualityPrompt(context.evalCase, context.candidate);
1413
1556
  const metadata = {
1414
1557
  systemPrompt: QUALITY_SYSTEM_PROMPT
1415
1558
  };
1416
1559
  const response = await judgeProvider.invoke({
1417
1560
  prompt,
1418
1561
  metadata,
1419
- testCaseId: context.testCase.id,
1562
+ evalCaseId: context.evalCase.id,
1420
1563
  attempt: context.attempt,
1421
1564
  maxOutputTokens: this.maxOutputTokens,
1422
1565
  temperature: this.temperature
@@ -1462,16 +1605,16 @@ var QUALITY_SYSTEM_PROMPT = [
1462
1605
  function buildQualityPrompt(testCase, candidate) {
1463
1606
  const parts = [
1464
1607
  "[[ ## expected_outcome ## ]]",
1465
- testCase.outcome,
1608
+ testCase.outcome.trim(),
1466
1609
  "",
1467
1610
  "[[ ## request ## ]]",
1468
- testCase.task,
1611
+ testCase.task.trim(),
1469
1612
  "",
1470
1613
  "[[ ## reference_answer ## ]]",
1471
- testCase.expected_assistant_raw,
1614
+ testCase.expected_assistant_raw.trim(),
1472
1615
  "",
1473
1616
  "[[ ## generated_answer ## ]]",
1474
- candidate,
1617
+ candidate.trim(),
1475
1618
  "",
1476
1619
  "Respond with a single JSON object matching the schema described in the system prompt."
1477
1620
  ];
@@ -1720,10 +1863,10 @@ async function runEvaluation(options) {
1720
1863
  onResult,
1721
1864
  onProgress
1722
1865
  } = options;
1723
- const load = loadTestCases;
1724
- const testCases = await load(testFilePath, repoRoot, { verbose });
1725
- const filteredTestCases = filterTestCases(testCases, evalId);
1726
- if (filteredTestCases.length === 0) {
1866
+ const load = loadEvalCases;
1867
+ const evalCases = await load(testFilePath, repoRoot, { verbose });
1868
+ const filteredEvalCases = filterEvalCases(evalCases, evalId);
1869
+ if (filteredEvalCases.length === 0) {
1727
1870
  if (evalId) {
1728
1871
  throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
1729
1872
  }
@@ -1769,35 +1912,62 @@ async function runEvaluation(options) {
1769
1912
  };
1770
1913
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
1771
1914
  const primaryProvider = getOrCreateProvider(target);
1772
- if (onProgress && filteredTestCases.length > 0) {
1773
- for (let i = 0; i < filteredTestCases.length; i++) {
1915
+ const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
1916
+ if (target.providerBatching && !providerSupportsBatch && verbose) {
1917
+ console.warn(
1918
+ `Provider batching requested for target '${target.name}', but provider does not advertise batch support. Using per-case dispatch.`
1919
+ );
1920
+ }
1921
+ if (onProgress && filteredEvalCases.length > 0) {
1922
+ for (let i = 0; i < filteredEvalCases.length; i++) {
1774
1923
  await onProgress({
1775
1924
  workerId: i + 1,
1776
- evalId: filteredTestCases[i].id,
1925
+ evalId: filteredEvalCases[i].id,
1777
1926
  status: "pending"
1778
1927
  });
1779
1928
  }
1780
1929
  }
1930
+ if (providerSupportsBatch) {
1931
+ try {
1932
+ return await runBatchEvaluation({
1933
+ evalCases: filteredEvalCases,
1934
+ provider: primaryProvider,
1935
+ target,
1936
+ graderRegistry,
1937
+ promptDumpDir,
1938
+ nowFn: now ?? (() => /* @__PURE__ */ new Date()),
1939
+ onProgress,
1940
+ onResult,
1941
+ verbose,
1942
+ resolveJudgeProvider
1943
+ });
1944
+ } catch (error) {
1945
+ if (verbose) {
1946
+ const message = error instanceof Error ? error.message : String(error);
1947
+ console.warn(`Provider batch execution failed, falling back to per-case dispatch: ${message}`);
1948
+ }
1949
+ }
1950
+ }
1781
1951
  const workers = options.maxConcurrency ?? target.workers ?? 1;
1782
1952
  const limit = pLimit(workers);
1783
1953
  let nextWorkerId = 1;
1784
1954
  const workerIdByEvalId = /* @__PURE__ */ new Map();
1785
- const promises = filteredTestCases.map(
1786
- (testCase) => limit(async () => {
1955
+ const promises = filteredEvalCases.map(
1956
+ (evalCase) => limit(async () => {
1787
1957
  const workerId = nextWorkerId++;
1788
- workerIdByEvalId.set(testCase.id, workerId);
1958
+ workerIdByEvalId.set(evalCase.id, workerId);
1789
1959
  if (onProgress) {
1790
1960
  await onProgress({
1791
1961
  workerId,
1792
- evalId: testCase.id,
1962
+ evalId: evalCase.id,
1793
1963
  status: "running",
1794
1964
  startedAt: Date.now()
1795
1965
  });
1796
1966
  }
1797
1967
  try {
1798
1968
  const judgeProvider = await resolveJudgeProvider(target);
1799
- const result = await runTestCase({
1800
- testCase,
1969
+ const result = await runEvalCase({
1970
+ evalCase,
1801
1971
  provider: primaryProvider,
1802
1972
  target,
1803
1973
  graders: graderRegistry,
@@ -1812,7 +1982,7 @@ async function runEvaluation(options) {
1812
1982
  if (onProgress) {
1813
1983
  await onProgress({
1814
1984
  workerId,
1815
- evalId: testCase.id,
1985
+ evalId: evalCase.id,
1816
1986
  status: "completed",
1817
1987
  startedAt: 0,
1818
1988
  // Not used for completed status
@@ -1827,7 +1997,7 @@ async function runEvaluation(options) {
1827
1997
  if (onProgress) {
1828
1998
  await onProgress({
1829
1999
  workerId,
1830
- evalId: testCase.id,
2000
+ evalId: evalCase.id,
1831
2001
  status: "failed",
1832
2002
  completedAt: Date.now(),
1833
2003
  error: error instanceof Error ? error.message : String(error)
@@ -1844,10 +2014,10 @@ async function runEvaluation(options) {
1844
2014
  if (outcome.status === "fulfilled") {
1845
2015
  results.push(outcome.value);
1846
2016
  } else {
1847
- const testCase = filteredTestCases[i];
1848
- const promptInputs = await buildPromptInputs(testCase);
2017
+ const evalCase = filteredEvalCases[i];
2018
+ const promptInputs = await buildPromptInputs(evalCase);
1849
2019
  const errorResult = buildErrorResult(
1850
- testCase,
2020
+ evalCase,
1851
2021
  target.name,
1852
2022
  (now ?? (() => /* @__PURE__ */ new Date()))(),
1853
2023
  outcome.reason,
@@ -1861,9 +2031,140 @@ async function runEvaluation(options) {
1861
2031
  }
1862
2032
  return results;
1863
2033
  }
1864
- async function runTestCase(options) {
2034
+ async function runBatchEvaluation(options) {
2035
+ const {
2036
+ evalCases,
2037
+ provider,
2038
+ target,
2039
+ graderRegistry,
2040
+ promptDumpDir,
2041
+ nowFn,
2042
+ onProgress,
2043
+ onResult,
2044
+ resolveJudgeProvider
2045
+ } = options;
2046
+ const promptInputsList = [];
2047
+ for (const evalCase of evalCases) {
2048
+ const promptInputs = await buildPromptInputs(evalCase);
2049
+ if (promptDumpDir) {
2050
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
2051
+ }
2052
+ promptInputsList.push(promptInputs);
2053
+ }
2054
+ const batchRequests = evalCases.map((evalCase, index) => {
2055
+ const promptInputs = promptInputsList[index];
2056
+ return {
2057
+ prompt: promptInputs.request,
2058
+ guidelines: promptInputs.guidelines,
2059
+ guideline_patterns: evalCase.guideline_patterns,
2060
+ attachments: evalCase.file_paths,
2061
+ evalCaseId: evalCase.id,
2062
+ metadata: {
2063
+ systemPrompt: promptInputs.systemMessage ?? ""
2064
+ }
2065
+ };
2066
+ });
2067
+ const batchResponse = await provider.invokeBatch?.(batchRequests);
2068
+ if (!Array.isArray(batchResponse)) {
2069
+ throw new Error("Provider batching failed: invokeBatch did not return an array");
2070
+ }
2071
+ if (batchResponse.length !== evalCases.length) {
2072
+ throw new Error(
2073
+ `Provider batching failed: expected ${evalCases.length} responses, received ${batchResponse.length}`
2074
+ );
2075
+ }
2076
+ if (onProgress) {
2077
+ const startedAt = Date.now();
2078
+ for (let i = 0; i < evalCases.length; i++) {
2079
+ await onProgress({
2080
+ workerId: 1,
2081
+ evalId: evalCases[i].id,
2082
+ status: "running",
2083
+ startedAt
2084
+ });
2085
+ }
2086
+ }
2087
+ const results = [];
2088
+ for (let i = 0; i < evalCases.length; i++) {
2089
+ const evalCase = evalCases[i];
2090
+ const promptInputs = promptInputsList[i];
2091
+ const providerResponse = batchResponse[i];
2092
+ const now = nowFn();
2093
+ const graderKind = evalCase.grader ?? "heuristic";
2094
+ const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
2095
+ if (!activeGrader) {
2096
+ throw new Error(`No grader registered for kind '${graderKind}'`);
2097
+ }
2098
+ let grade;
2099
+ try {
2100
+ grade = await activeGrader.grade({
2101
+ evalCase,
2102
+ candidate: providerResponse.text ?? "",
2103
+ target,
2104
+ provider,
2105
+ attempt: 0,
2106
+ promptInputs,
2107
+ now,
2108
+ judgeProvider: await resolveJudgeProvider(target)
2109
+ });
2110
+ } catch (error) {
2111
+ const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
2112
+ results.push(errorResult);
2113
+ if (onResult) {
2114
+ await onResult(errorResult);
2115
+ }
2116
+ if (onProgress) {
2117
+ await onProgress({
2118
+ workerId: 1,
2119
+ evalId: evalCase.id,
2120
+ status: "failed",
2121
+ completedAt: Date.now(),
2122
+ error: error instanceof Error ? error.message : String(error)
2123
+ });
2124
+ }
2125
+ continue;
2126
+ }
2127
+ const completedAt = nowFn();
2128
+ const rawRequest = {
2129
+ request: promptInputs.request,
2130
+ guidelines: promptInputs.guidelines,
2131
+ guideline_paths: evalCase.guideline_paths,
2132
+ system_message: promptInputs.systemMessage ?? ""
2133
+ };
2134
+ const result = {
2135
+ eval_id: evalCase.id,
2136
+ conversation_id: evalCase.conversation_id,
2137
+ score: grade.score,
2138
+ hits: grade.hits,
2139
+ misses: grade.misses,
2140
+ model_answer: providerResponse.text ?? "",
2141
+ expected_aspect_count: grade.expectedAspectCount,
2142
+ target: target.name,
2143
+ timestamp: completedAt.toISOString(),
2144
+ reasoning: grade.reasoning,
2145
+ raw_aspects: grade.rawAspects,
2146
+ raw_request: rawRequest,
2147
+ grader_raw_request: grade.graderRawRequest
2148
+ };
2149
+ results.push(result);
2150
+ if (onResult) {
2151
+ await onResult(result);
2152
+ }
2153
+ if (onProgress) {
2154
+ await onProgress({
2155
+ workerId: 1,
2156
+ evalId: evalCase.id,
2157
+ status: "completed",
2158
+ startedAt: 0,
2159
+ completedAt: Date.now()
2160
+ });
2161
+ }
2162
+ }
2163
+ return results;
2164
+ }
2165
+ async function runEvalCase(options) {
1865
2166
  const {
1866
- testCase,
2167
+ evalCase,
1867
2168
  provider,
1868
2169
  target,
1869
2170
  graders,
@@ -1876,11 +2177,11 @@ async function runTestCase(options) {
1876
2177
  signal,
1877
2178
  judgeProvider
1878
2179
  } = options;
1879
- const promptInputs = await buildPromptInputs(testCase);
2180
+ const promptInputs = await buildPromptInputs(evalCase);
1880
2181
  if (promptDumpDir) {
1881
- await dumpPrompt(promptDumpDir, testCase, promptInputs);
2182
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
1882
2183
  }
1883
- const cacheKey = useCache ? createCacheKey(provider, target, testCase, promptInputs) : void 0;
2184
+ const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
1884
2185
  let cachedResponse;
1885
2186
  if (cacheKey && cache) {
1886
2187
  cachedResponse = await cache.get(cacheKey);
@@ -1893,7 +2194,7 @@ async function runTestCase(options) {
1893
2194
  while (!providerResponse && attempt < attemptBudget) {
1894
2195
  try {
1895
2196
  providerResponse = await invokeProvider(provider, {
1896
- testCase,
2197
+ evalCase,
1897
2198
  target,
1898
2199
  promptInputs,
1899
2200
  attempt,
@@ -1906,12 +2207,12 @@ async function runTestCase(options) {
1906
2207
  attempt += 1;
1907
2208
  continue;
1908
2209
  }
1909
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
2210
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
1910
2211
  }
1911
2212
  }
1912
2213
  if (!providerResponse) {
1913
2214
  return buildErrorResult(
1914
- testCase,
2215
+ evalCase,
1915
2216
  target.name,
1916
2217
  nowFn(),
1917
2218
  lastError ?? new Error("Provider did not return a response"),
@@ -1921,7 +2222,7 @@ async function runTestCase(options) {
1921
2222
  if (cacheKey && cache && !cachedResponse) {
1922
2223
  await cache.set(cacheKey, providerResponse);
1923
2224
  }
1924
- const graderKind = testCase.grader ?? "heuristic";
2225
+ const graderKind = evalCase.grader ?? "heuristic";
1925
2226
  const activeGrader = graders[graderKind] ?? graders.heuristic;
1926
2227
  if (!activeGrader) {
1927
2228
  throw new Error(`No grader registered for kind '${graderKind}'`);
@@ -1930,7 +2231,7 @@ async function runTestCase(options) {
1930
2231
  try {
1931
2232
  const gradeTimestamp = nowFn();
1932
2233
  grade = await activeGrader.grade({
1933
- testCase,
2234
+ evalCase,
1934
2235
  candidate: providerResponse.text ?? "",
1935
2236
  target,
1936
2237
  provider,
@@ -1940,17 +2241,18 @@ async function runTestCase(options) {
1940
2241
  judgeProvider
1941
2242
  });
1942
2243
  } catch (error) {
1943
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
2244
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
1944
2245
  }
1945
2246
  const completedAt = nowFn();
1946
2247
  const rawRequest = {
1947
2248
  request: promptInputs.request,
1948
2249
  guidelines: promptInputs.guidelines,
1949
- guideline_paths: testCase.guideline_paths
2250
+ guideline_paths: evalCase.guideline_paths,
2251
+ system_message: promptInputs.systemMessage ?? ""
1950
2252
  };
1951
2253
  return {
1952
- eval_id: testCase.id,
1953
- conversation_id: testCase.conversation_id,
2254
+ eval_id: evalCase.id,
2255
+ conversation_id: evalCase.conversation_id,
1954
2256
  score: grade.score,
1955
2257
  hits: grade.hits,
1956
2258
  misses: grade.misses,
@@ -1964,11 +2266,11 @@ async function runTestCase(options) {
1964
2266
  grader_raw_request: grade.graderRawRequest
1965
2267
  };
1966
2268
  }
1967
- function filterTestCases(testCases, evalId) {
2269
+ function filterEvalCases(evalCases, evalId) {
1968
2270
  if (!evalId) {
1969
- return testCases;
2271
+ return evalCases;
1970
2272
  }
1971
- return testCases.filter((testCase) => testCase.id === evalId);
2273
+ return evalCases.filter((evalCase) => evalCase.id === evalId);
1972
2274
  }
1973
2275
  function buildGraderRegistry(overrides, resolveJudgeProvider) {
1974
2276
  const heuristic = overrides?.heuristic ?? new HeuristicGrader();
@@ -1986,16 +2288,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
1986
2288
  llm_judge: llmJudge
1987
2289
  };
1988
2290
  }
1989
- async function dumpPrompt(directory, testCase, promptInputs) {
2291
+ async function dumpPrompt(directory, evalCase, promptInputs) {
1990
2292
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1991
- const filename = `${timestamp}_${sanitizeFilename(testCase.id)}.json`;
2293
+ const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
1992
2294
  const filePath = path4.resolve(directory, filename);
1993
2295
  await mkdir(path4.dirname(filePath), { recursive: true });
1994
2296
  const payload = {
1995
- eval_id: testCase.id,
2297
+ eval_id: evalCase.id,
1996
2298
  request: promptInputs.request,
1997
2299
  guidelines: promptInputs.guidelines,
1998
- guideline_paths: testCase.guideline_paths
2300
+ guideline_paths: evalCase.guideline_paths
1999
2301
  };
2000
2302
  await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
2001
2303
  }
@@ -2007,7 +2309,7 @@ function sanitizeFilename(value) {
2007
2309
  return sanitized.length > 0 ? sanitized : randomUUID2();
2008
2310
  }
2009
2311
  async function invokeProvider(provider, options) {
2010
- const { testCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2312
+ const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2011
2313
  const controller = new AbortController();
2012
2314
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2013
2315
  if (signal) {
@@ -2017,12 +2319,12 @@ async function invokeProvider(provider, options) {
2017
2319
  return await provider.invoke({
2018
2320
  prompt: promptInputs.request,
2019
2321
  guidelines: promptInputs.guidelines,
2020
- attachments: testCase.guideline_paths,
2021
- testCaseId: testCase.id,
2322
+ guideline_patterns: evalCase.guideline_patterns,
2323
+ attachments: evalCase.file_paths,
2324
+ evalCaseId: evalCase.id,
2022
2325
  attempt,
2023
2326
  metadata: {
2024
- target: target.name,
2025
- grader: testCase.grader
2327
+ systemPrompt: promptInputs.systemMessage ?? ""
2026
2328
  },
2027
2329
  signal: controller.signal
2028
2330
  });
@@ -2032,17 +2334,18 @@ async function invokeProvider(provider, options) {
2032
2334
  }
2033
2335
  }
2034
2336
  }
2035
- function buildErrorResult(testCase, targetName, timestamp, error, promptInputs) {
2337
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
2036
2338
  const message = error instanceof Error ? error.message : String(error);
2037
2339
  const rawRequest = {
2038
2340
  request: promptInputs.request,
2039
2341
  guidelines: promptInputs.guidelines,
2040
- guideline_paths: testCase.guideline_paths,
2342
+ guideline_paths: evalCase.guideline_paths,
2343
+ system_message: promptInputs.systemMessage ?? "",
2041
2344
  error: message
2042
2345
  };
2043
2346
  return {
2044
- eval_id: testCase.id,
2045
- conversation_id: testCase.conversation_id,
2347
+ eval_id: evalCase.id,
2348
+ conversation_id: evalCase.conversation_id,
2046
2349
  score: 0,
2047
2350
  hits: [],
2048
2351
  misses: [`Error: ${message}`],
@@ -2054,13 +2357,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
2054
2357
  raw_request: rawRequest
2055
2358
  };
2056
2359
  }
2057
- function createCacheKey(provider, target, testCase, promptInputs) {
2360
+ function createCacheKey(provider, target, evalCase, promptInputs) {
2058
2361
  const hash = createHash("sha256");
2059
2362
  hash.update(provider.id);
2060
2363
  hash.update(target.name);
2061
- hash.update(testCase.id);
2364
+ hash.update(evalCase.id);
2062
2365
  hash.update(promptInputs.request);
2063
2366
  hash.update(promptInputs.guidelines);
2367
+ hash.update(promptInputs.systemMessage ?? "");
2064
2368
  return hash.digest("hex");
2065
2369
  }
2066
2370
  function isTimeoutLike(error) {
@@ -2088,7 +2392,9 @@ export {
2088
2392
  HeuristicGrader,
2089
2393
  QualityGrader,
2090
2394
  TEST_MESSAGE_ROLES,
2395
+ buildDirectoryChain,
2091
2396
  buildPromptInputs,
2397
+ buildSearchRoots,
2092
2398
  calculateHits,
2093
2399
  calculateMisses,
2094
2400
  createAgentKernel,
@@ -2096,6 +2402,8 @@ export {
2096
2402
  ensureVSCodeSubagents,
2097
2403
  extractAspects,
2098
2404
  extractCodeBlocks,
2405
+ fileExists,
2406
+ findGitRoot,
2099
2407
  getHitCount,
2100
2408
  isErrorLike,
2101
2409
  isGraderKind,
@@ -2105,12 +2413,13 @@ export {
2105
2413
  isTestMessage,
2106
2414
  isTestMessageRole,
2107
2415
  listTargetNames,
2108
- loadTestCases,
2416
+ loadEvalCases,
2109
2417
  readTargetDefinitions,
2110
2418
  resolveAndCreateProvider,
2419
+ resolveFileReference,
2111
2420
  resolveTargetDefinition,
2421
+ runEvalCase,
2112
2422
  runEvaluation,
2113
- runTestCase,
2114
2423
  scoreCandidateResponse
2115
2424
  };
2116
2425
  //# sourceMappingURL=index.js.map