@agentv/core 0.2.6 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,8 +1,11 @@
1
1
  import {
2
2
  TARGETS_SCHEMA_V2,
3
+ buildDirectoryChain,
3
4
  buildSearchRoots,
5
+ fileExists,
6
+ findGitRoot,
4
7
  resolveFileReference
5
- } from "./chunk-QVS4OL44.js";
8
+ } from "./chunk-XXNQA4EW.js";
6
9
 
7
10
  // src/evaluation/types.ts
8
11
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -56,6 +59,7 @@ function getHitCount(result) {
56
59
  }
57
60
 
58
61
  // src/evaluation/yaml-parser.ts
62
+ import micromatch from "micromatch";
59
63
  import { constants } from "node:fs";
60
64
  import { access, readFile } from "node:fs/promises";
61
65
  import path from "node:path";
@@ -65,9 +69,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
65
69
  var ANSI_YELLOW = "\x1B[33m";
66
70
  var ANSI_RESET = "\x1B[0m";
67
71
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
68
- function isGuidelineFile(filePath) {
72
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
73
+ async function loadConfig(evalFilePath, repoRoot) {
74
+ const directories = buildDirectoryChain(evalFilePath, repoRoot);
75
+ for (const directory of directories) {
76
+ const configPath = path.join(directory, ".agentv", "config.yaml");
77
+ if (!await fileExists2(configPath)) {
78
+ continue;
79
+ }
80
+ try {
81
+ const rawConfig = await readFile(configPath, "utf8");
82
+ const parsed = parse(rawConfig);
83
+ if (!isJsonObject(parsed)) {
84
+ logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
85
+ continue;
86
+ }
87
+ const config = parsed;
88
+ const schema = config.$schema;
89
+ if (schema !== SCHEMA_CONFIG_V2) {
90
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
91
+ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
92
+ logWarning(message);
93
+ continue;
94
+ }
95
+ const guidelinePatterns = config.guideline_patterns;
96
+ if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
97
+ logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
98
+ continue;
99
+ }
100
+ if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
101
+ logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
102
+ continue;
103
+ }
104
+ return {
105
+ guideline_patterns: guidelinePatterns
106
+ };
107
+ } catch (error) {
108
+ logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
109
+ continue;
110
+ }
111
+ }
112
+ return null;
113
+ }
114
+ function isGuidelineFile(filePath, patterns) {
69
115
  const normalized = filePath.split("\\").join("/");
70
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
116
+ const patternsToUse = patterns ?? [];
117
+ return micromatch.isMatch(normalized, patternsToUse);
71
118
  }
72
119
  function extractCodeBlocks(segments) {
73
120
  const codeBlocks = [];
@@ -87,43 +134,45 @@ function extractCodeBlocks(segments) {
87
134
  }
88
135
  return codeBlocks;
89
136
  }
90
- async function loadTestCases(testFilePath, repoRoot, options) {
137
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
91
138
  const verbose = options?.verbose ?? false;
92
- const absoluteTestPath = path.resolve(testFilePath);
93
- if (!await fileExists(absoluteTestPath)) {
94
- throw new Error(`Test file not found: ${testFilePath}`);
139
+ const absoluteTestPath = path.resolve(evalFilePath);
140
+ if (!await fileExists2(absoluteTestPath)) {
141
+ throw new Error(`Test file not found: ${evalFilePath}`);
95
142
  }
96
143
  const repoRootPath = resolveToAbsolutePath(repoRoot);
97
144
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
145
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
146
+ const guidelinePatterns = config?.guideline_patterns;
98
147
  const rawFile = await readFile(absoluteTestPath, "utf8");
99
148
  const parsed = parse(rawFile);
100
149
  if (!isJsonObject(parsed)) {
101
- throw new Error(`Invalid test file format: ${testFilePath}`);
150
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
102
151
  }
103
152
  const suite = parsed;
104
153
  const schema = suite.$schema;
105
154
  if (schema !== SCHEMA_EVAL_V2) {
106
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${testFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${testFilePath}.
155
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
107
156
  Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
108
157
  throw new Error(message);
109
158
  }
110
159
  const rawTestcases = suite.evalcases;
111
160
  if (!Array.isArray(rawTestcases)) {
112
- throw new Error(`Invalid test file format: ${testFilePath} - missing 'evalcases' field`);
161
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
113
162
  }
114
163
  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
115
164
  const results = [];
116
- for (const rawTestcase of rawTestcases) {
117
- if (!isJsonObject(rawTestcase)) {
165
+ for (const rawEvalcase of rawTestcases) {
166
+ if (!isJsonObject(rawEvalcase)) {
118
167
  logWarning("Skipping invalid test case entry (expected object)");
119
168
  continue;
120
169
  }
121
- const testcase = rawTestcase;
122
- const id = asString(testcase.id);
123
- const conversationId = asString(testcase.conversation_id);
124
- const outcome = asString(testcase.outcome);
125
- const inputMessagesValue = testcase.input_messages;
126
- const expectedMessagesValue = testcase.expected_messages;
170
+ const evalcase = rawEvalcase;
171
+ const id = asString(evalcase.id);
172
+ const conversationId = asString(evalcase.conversation_id);
173
+ const outcome = asString(evalcase.outcome);
174
+ const inputMessagesValue = evalcase.input_messages;
175
+ const expectedMessagesValue = evalcase.expected_messages;
127
176
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
128
177
  logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
129
178
  continue;
@@ -136,6 +185,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
136
185
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
137
186
  const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
138
187
  const userMessages = inputMessages.filter((message) => message.role === "user");
188
+ const systemMessages = inputMessages.filter((message) => message.role === "system");
139
189
  if (assistantMessages.length === 0) {
140
190
  logWarning(`No assistant message found for test case: ${id}`);
141
191
  continue;
@@ -143,6 +193,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
143
193
  if (assistantMessages.length > 1) {
144
194
  logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
145
195
  }
196
+ if (systemMessages.length > 1) {
197
+ logWarning(`Multiple system messages found for test case: ${id}, using first`);
198
+ }
199
+ let systemMessageContent;
200
+ if (systemMessages.length > 0) {
201
+ const content = systemMessages[0]?.content;
202
+ if (typeof content === "string") {
203
+ systemMessageContent = content;
204
+ } else if (Array.isArray(content)) {
205
+ const textParts = [];
206
+ for (const segment of content) {
207
+ if (isJsonObject(segment)) {
208
+ const value = segment.value;
209
+ if (typeof value === "string") {
210
+ textParts.push(value);
211
+ }
212
+ }
213
+ }
214
+ if (textParts.length > 0) {
215
+ systemMessageContent = textParts.join("\n\n");
216
+ }
217
+ }
218
+ }
146
219
  const userSegments = [];
147
220
  const guidelinePaths = [];
148
221
  const userTextParts = [];
@@ -174,7 +247,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
174
247
  }
175
248
  try {
176
249
  const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
177
- if (isGuidelineFile(displayPath)) {
250
+ const relativeToRepo = path.relative(repoRootPath, resolvedPath);
251
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
178
252
  guidelinePaths.push(path.resolve(resolvedPath));
179
253
  if (verbose) {
180
254
  console.log(` [Guideline] Found: ${displayPath}`);
@@ -184,7 +258,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
184
258
  userSegments.push({
185
259
  type: "file",
186
260
  path: displayPath,
187
- text: fileContent
261
+ text: fileContent,
262
+ resolvedPath: path.resolve(resolvedPath)
188
263
  });
189
264
  if (verbose) {
190
265
  console.log(` [File] Found: ${displayPath}`);
@@ -208,14 +283,27 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
208
283
  const assistantContent = assistantMessages[0]?.content;
209
284
  const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
210
285
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
211
- const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
286
+ const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
287
+ const userFilePaths = [];
288
+ for (const segment of userSegments) {
289
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
290
+ userFilePaths.push(segment.resolvedPath);
291
+ }
292
+ }
293
+ const allFilePaths = [
294
+ ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
295
+ ...userFilePaths
296
+ ];
212
297
  const testCase = {
213
298
  id,
214
299
  conversation_id: conversationId,
215
300
  task: userTextPrompt,
216
301
  user_segments: userSegments,
302
+ system_message: systemMessageContent,
217
303
  expected_assistant_raw: expectedAssistantRaw,
218
304
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
305
+ guideline_patterns: guidelinePatterns,
306
+ file_paths: allFilePaths,
219
307
  code_snippets: codeSnippets,
220
308
  outcome,
221
309
  grader: testCaseGrader
@@ -240,7 +328,7 @@ async function buildPromptInputs(testCase) {
240
328
  const guidelineContents = [];
241
329
  for (const rawPath of testCase.guideline_paths) {
242
330
  const absolutePath = path.resolve(rawPath);
243
- if (!await fileExists(absolutePath)) {
331
+ if (!await fileExists2(absolutePath)) {
244
332
  logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
245
333
  continue;
246
334
  }
@@ -281,9 +369,9 @@ ${body}`);
281
369
  }
282
370
  const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
283
371
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
284
- return { request, guidelines };
372
+ return { request, guidelines, systemMessage: testCase.system_message };
285
373
  }
286
- async function fileExists(absolutePath) {
374
+ async function fileExists2(absolutePath) {
287
375
  try {
288
376
  await access(absolutePath, constants.F_OK);
289
377
  return true;
@@ -407,15 +495,18 @@ function buildChatPrompt(request) {
407
495
  return request.chatPrompt;
408
496
  }
409
497
  const systemSegments = [];
410
- if (request.guidelines && request.guidelines.trim().length > 0) {
411
- systemSegments.push(`Guidelines:
412
- ${request.guidelines.trim()}`);
413
- }
414
498
  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
415
499
  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
416
500
  systemSegments.push(metadataSystemPrompt.trim());
501
+ } else {
502
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
503
+ }
504
+ if (request.guidelines && request.guidelines.trim().length > 0) {
505
+ systemSegments.push(`[[ ## Guidelines ## ]]
506
+
507
+ ${request.guidelines.trim()}`);
417
508
  }
418
- const systemContent = systemSegments.length > 0 ? systemSegments.join("\n\n") : DEFAULT_SYSTEM_PROMPT;
509
+ const systemContent = systemSegments.join("\n\n");
419
510
  const userContent = request.prompt.trim();
420
511
  const prompt = [
421
512
  {
@@ -871,11 +962,9 @@ function isLikelyEnvReference(value) {
871
962
  }
872
963
 
873
964
  // src/evaluation/providers/vscode.ts
874
- import { mkdtemp, readFile as readFile2, rm, writeFile } from "node:fs/promises";
875
- import { tmpdir } from "node:os";
965
+ import { readFile as readFile2 } from "node:fs/promises";
876
966
  import path2 from "node:path";
877
967
  import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
878
- var PROMPT_FILE_PREFIX = "agentv-vscode-";
879
968
  var VSCodeProvider = class {
880
969
  id;
881
970
  kind;
@@ -892,128 +981,89 @@ var VSCodeProvider = class {
892
981
  throw new Error("VS Code provider request was aborted before dispatch");
893
982
  }
894
983
  const attachments = normalizeAttachments(request.attachments);
895
- const promptContent = buildPromptDocument(request, attachments);
896
- const directory = await mkdtemp(path2.join(tmpdir(), PROMPT_FILE_PREFIX));
897
- const promptPath = path2.join(directory, `${request.testCaseId ?? "request"}.prompt.md`);
898
- try {
899
- await writeFile(promptPath, promptContent, "utf8");
900
- const session = await dispatchAgentSession({
901
- userQuery: composeUserQuery(request),
902
- promptFile: promptPath,
903
- extraAttachments: attachments,
904
- wait: this.config.waitForResponse,
905
- dryRun: this.config.dryRun,
906
- vscodeCmd: this.config.command,
907
- subagentRoot: this.config.subagentRoot,
908
- workspaceTemplate: this.config.workspaceTemplate,
909
- silent: true
910
- });
911
- if (session.exitCode !== 0 || !session.responseFile) {
912
- const failure = session.error ?? "VS Code subagent did not produce a response";
913
- throw new Error(failure);
914
- }
915
- if (this.config.dryRun) {
916
- return {
917
- text: "",
918
- raw: {
919
- session,
920
- promptFile: promptPath,
921
- attachments
922
- }
923
- };
924
- }
925
- const responseText = await readFile2(session.responseFile, "utf8");
984
+ const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
985
+ const session = await dispatchAgentSession({
986
+ userQuery: promptContent,
987
+ // Use full prompt content instead of just request.prompt
988
+ extraAttachments: attachments,
989
+ wait: this.config.waitForResponse,
990
+ dryRun: this.config.dryRun,
991
+ vscodeCmd: this.config.command,
992
+ subagentRoot: this.config.subagentRoot,
993
+ workspaceTemplate: this.config.workspaceTemplate,
994
+ silent: true
995
+ });
996
+ if (session.exitCode !== 0 || !session.responseFile) {
997
+ const failure = session.error ?? "VS Code subagent did not produce a response";
998
+ throw new Error(failure);
999
+ }
1000
+ if (this.config.dryRun) {
926
1001
  return {
927
- text: responseText,
1002
+ text: "",
928
1003
  raw: {
929
1004
  session,
930
- promptFile: promptPath,
931
1005
  attachments
932
1006
  }
933
1007
  };
934
- } finally {
935
- await rm(directory, { recursive: true, force: true });
936
1008
  }
1009
+ const responseText = await readFile2(session.responseFile, "utf8");
1010
+ return {
1011
+ text: responseText,
1012
+ raw: {
1013
+ session,
1014
+ attachments
1015
+ }
1016
+ };
937
1017
  }
938
1018
  };
939
- function buildPromptDocument(request, attachments) {
1019
+ function buildPromptDocument(request, attachments, guidelinePatterns) {
940
1020
  const parts = [];
941
- const instructionFiles = collectInstructionFiles(attachments);
942
- if (instructionFiles.length > 0) {
943
- parts.push(buildMandatoryPrereadBlock(instructionFiles));
944
- }
945
- parts.push(`# AgentV Request`);
946
- if (request.testCaseId) {
947
- parts.push(`- Test Case: ${request.testCaseId}`);
948
- }
949
- if (request.metadata?.target) {
950
- parts.push(`- Target: ${String(request.metadata.target)}`);
951
- }
952
- parts.push("\n## Task\n", request.prompt.trim());
953
- if (request.guidelines && request.guidelines.trim().length > 0) {
954
- parts.push("\n## Guidelines\n", request.guidelines.trim());
955
- }
956
- if (attachments && attachments.length > 0) {
957
- const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
958
- parts.push("\n## Attachments\n", attachmentList);
1021
+ const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1022
+ if (guidelineFiles.length > 0) {
1023
+ parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
959
1024
  }
1025
+ parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
960
1026
  return parts.join("\n").trim();
961
1027
  }
962
- function buildMandatoryPrereadBlock(instructionFiles) {
963
- if (instructionFiles.length === 0) {
1028
+ function buildMandatoryPrereadBlock(guidelineFiles) {
1029
+ if (guidelineFiles.length === 0) {
964
1030
  return "";
965
1031
  }
966
1032
  const fileList = [];
967
- const tokenList = [];
968
1033
  let counter = 0;
969
- for (const absolutePath of instructionFiles) {
1034
+ for (const absolutePath of guidelineFiles) {
970
1035
  counter += 1;
971
1036
  const fileName = path2.basename(absolutePath);
972
1037
  const fileUri = pathToFileUri(absolutePath);
973
- fileList.push(`[${fileName}](${fileUri})`);
974
- tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
1038
+ fileList.push(`* [${fileName}](${fileUri})`);
975
1039
  }
976
- const filesText = fileList.join(", ");
977
- const tokensText = tokenList.join("\n");
1040
+ const filesText = fileList.join("\n");
978
1041
  const instruction = [
979
- `Read all instruction files: ${filesText}.`,
980
- `After reading each file, compute its SHA256 hash using this PowerShell command:`,
981
- "`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
982
- `Then include, at the top of your reply, these exact tokens on separate lines:
1042
+ `Read all guideline files:
1043
+ ${filesText}.
983
1044
  `,
984
- tokensText,
985
- `
986
- Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
987
1045
  `If any file is missing, fail with ERROR: missing-file <filename> and stop.
988
1046
  `,
989
- `Then fetch all documentation required by the instructions before proceeding with your task.`
990
- ].join(" ");
991
- return `[[ ## mandatory_pre_read ## ]]
992
-
993
- ${instruction}
994
-
995
- `;
1047
+ `Then apply system_instructions on the user query below.`
1048
+ ].join("");
1049
+ return `${instruction}`;
996
1050
  }
997
- function collectInstructionFiles(attachments) {
1051
+ function collectGuidelineFiles(attachments, guidelinePatterns) {
998
1052
  if (!attachments || attachments.length === 0) {
999
1053
  return [];
1000
1054
  }
1001
1055
  const unique = /* @__PURE__ */ new Map();
1002
1056
  for (const attachment of attachments) {
1003
- if (!isInstructionPath(attachment)) {
1004
- continue;
1005
- }
1006
1057
  const absolutePath = path2.resolve(attachment);
1007
- if (!unique.has(absolutePath)) {
1008
- unique.set(absolutePath, absolutePath);
1058
+ const normalized = absolutePath.split(path2.sep).join("/");
1059
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
1060
+ if (!unique.has(absolutePath)) {
1061
+ unique.set(absolutePath, absolutePath);
1062
+ }
1009
1063
  }
1010
1064
  }
1011
1065
  return Array.from(unique.values());
1012
1066
  }
1013
- function isInstructionPath(filePath) {
1014
- const normalized = filePath.split(path2.sep).join("/");
1015
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
1016
- }
1017
1067
  function pathToFileUri(filePath) {
1018
1068
  const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
1019
1069
  const normalizedPath = absolutePath.replace(/\\/g, "/");
@@ -1022,14 +1072,6 @@ function pathToFileUri(filePath) {
1022
1072
  }
1023
1073
  return `file://${normalizedPath}`;
1024
1074
  }
1025
- function composeUserQuery(request) {
1026
- const segments = [];
1027
- segments.push(request.prompt.trim());
1028
- if (request.guidelines && request.guidelines.trim().length > 0) {
1029
- segments.push("\nGuidelines:\n", request.guidelines.trim());
1030
- }
1031
- return segments.join("\n").trim();
1032
- }
1033
1075
  function normalizeAttachments(attachments) {
1034
1076
  if (!attachments || attachments.length === 0) {
1035
1077
  return void 0;
@@ -1136,7 +1178,7 @@ function assertTargetDefinition(value, index, filePath) {
1136
1178
  judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
1137
1179
  };
1138
1180
  }
1139
- async function fileExists2(filePath) {
1181
+ async function fileExists3(filePath) {
1140
1182
  try {
1141
1183
  await access2(filePath, constants2.F_OK);
1142
1184
  return true;
@@ -1146,7 +1188,7 @@ async function fileExists2(filePath) {
1146
1188
  }
1147
1189
  async function readTargetDefinitions(filePath) {
1148
1190
  const absolutePath = path3.resolve(filePath);
1149
- if (!await fileExists2(absolutePath)) {
1191
+ if (!await fileExists3(absolutePath)) {
1150
1192
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1151
1193
  }
1152
1194
  const raw = await readFile3(absolutePath, "utf8");
@@ -1376,7 +1418,7 @@ import { randomUUID } from "node:crypto";
1376
1418
  var HeuristicGrader = class {
1377
1419
  kind = "heuristic";
1378
1420
  grade(context) {
1379
- const expectedAspects = extractAspects(context.testCase.expected_assistant_raw);
1421
+ const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1380
1422
  const result = scoreCandidateResponse(context.candidate, expectedAspects);
1381
1423
  const misses = [...result.misses];
1382
1424
  if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
@@ -1409,14 +1451,14 @@ var QualityGrader = class {
1409
1451
  if (!judgeProvider) {
1410
1452
  throw new Error("No judge provider available for LLM grading");
1411
1453
  }
1412
- const prompt = buildQualityPrompt(context.testCase, context.candidate);
1454
+ const prompt = buildQualityPrompt(context.evalCase, context.candidate);
1413
1455
  const metadata = {
1414
1456
  systemPrompt: QUALITY_SYSTEM_PROMPT
1415
1457
  };
1416
1458
  const response = await judgeProvider.invoke({
1417
1459
  prompt,
1418
1460
  metadata,
1419
- testCaseId: context.testCase.id,
1461
+ evalCaseId: context.evalCase.id,
1420
1462
  attempt: context.attempt,
1421
1463
  maxOutputTokens: this.maxOutputTokens,
1422
1464
  temperature: this.temperature
@@ -1462,16 +1504,16 @@ var QUALITY_SYSTEM_PROMPT = [
1462
1504
  function buildQualityPrompt(testCase, candidate) {
1463
1505
  const parts = [
1464
1506
  "[[ ## expected_outcome ## ]]",
1465
- testCase.outcome,
1507
+ testCase.outcome.trim(),
1466
1508
  "",
1467
1509
  "[[ ## request ## ]]",
1468
- testCase.task,
1510
+ testCase.task.trim(),
1469
1511
  "",
1470
1512
  "[[ ## reference_answer ## ]]",
1471
- testCase.expected_assistant_raw,
1513
+ testCase.expected_assistant_raw.trim(),
1472
1514
  "",
1473
1515
  "[[ ## generated_answer ## ]]",
1474
- candidate,
1516
+ candidate.trim(),
1475
1517
  "",
1476
1518
  "Respond with a single JSON object matching the schema described in the system prompt."
1477
1519
  ];
@@ -1720,10 +1762,10 @@ async function runEvaluation(options) {
1720
1762
  onResult,
1721
1763
  onProgress
1722
1764
  } = options;
1723
- const load = loadTestCases;
1724
- const testCases = await load(testFilePath, repoRoot, { verbose });
1725
- const filteredTestCases = filterTestCases(testCases, evalId);
1726
- if (filteredTestCases.length === 0) {
1765
+ const load = loadEvalCases;
1766
+ const evalCases = await load(testFilePath, repoRoot, { verbose });
1767
+ const filteredEvalCases = filterEvalCases(evalCases, evalId);
1768
+ if (filteredEvalCases.length === 0) {
1727
1769
  if (evalId) {
1728
1770
  throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
1729
1771
  }
@@ -1769,11 +1811,11 @@ async function runEvaluation(options) {
1769
1811
  };
1770
1812
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
1771
1813
  const primaryProvider = getOrCreateProvider(target);
1772
- if (onProgress && filteredTestCases.length > 0) {
1773
- for (let i = 0; i < filteredTestCases.length; i++) {
1814
+ if (onProgress && filteredEvalCases.length > 0) {
1815
+ for (let i = 0; i < filteredEvalCases.length; i++) {
1774
1816
  await onProgress({
1775
1817
  workerId: i + 1,
1776
- evalId: filteredTestCases[i].id,
1818
+ evalId: filteredEvalCases[i].id,
1777
1819
  status: "pending"
1778
1820
  });
1779
1821
  }
@@ -1782,22 +1824,22 @@ async function runEvaluation(options) {
1782
1824
  const limit = pLimit(workers);
1783
1825
  let nextWorkerId = 1;
1784
1826
  const workerIdByEvalId = /* @__PURE__ */ new Map();
1785
- const promises = filteredTestCases.map(
1786
- (testCase) => limit(async () => {
1827
+ const promises = filteredEvalCases.map(
1828
+ (evalCase) => limit(async () => {
1787
1829
  const workerId = nextWorkerId++;
1788
- workerIdByEvalId.set(testCase.id, workerId);
1830
+ workerIdByEvalId.set(evalCase.id, workerId);
1789
1831
  if (onProgress) {
1790
1832
  await onProgress({
1791
1833
  workerId,
1792
- evalId: testCase.id,
1834
+ evalId: evalCase.id,
1793
1835
  status: "running",
1794
1836
  startedAt: Date.now()
1795
1837
  });
1796
1838
  }
1797
1839
  try {
1798
1840
  const judgeProvider = await resolveJudgeProvider(target);
1799
- const result = await runTestCase({
1800
- testCase,
1841
+ const result = await runEvalCase({
1842
+ evalCase,
1801
1843
  provider: primaryProvider,
1802
1844
  target,
1803
1845
  graders: graderRegistry,
@@ -1812,7 +1854,7 @@ async function runEvaluation(options) {
1812
1854
  if (onProgress) {
1813
1855
  await onProgress({
1814
1856
  workerId,
1815
- evalId: testCase.id,
1857
+ evalId: evalCase.id,
1816
1858
  status: "completed",
1817
1859
  startedAt: 0,
1818
1860
  // Not used for completed status
@@ -1827,7 +1869,7 @@ async function runEvaluation(options) {
1827
1869
  if (onProgress) {
1828
1870
  await onProgress({
1829
1871
  workerId,
1830
- evalId: testCase.id,
1872
+ evalId: evalCase.id,
1831
1873
  status: "failed",
1832
1874
  completedAt: Date.now(),
1833
1875
  error: error instanceof Error ? error.message : String(error)
@@ -1844,10 +1886,10 @@ async function runEvaluation(options) {
1844
1886
  if (outcome.status === "fulfilled") {
1845
1887
  results.push(outcome.value);
1846
1888
  } else {
1847
- const testCase = filteredTestCases[i];
1848
- const promptInputs = await buildPromptInputs(testCase);
1889
+ const evalCase = filteredEvalCases[i];
1890
+ const promptInputs = await buildPromptInputs(evalCase);
1849
1891
  const errorResult = buildErrorResult(
1850
- testCase,
1892
+ evalCase,
1851
1893
  target.name,
1852
1894
  (now ?? (() => /* @__PURE__ */ new Date()))(),
1853
1895
  outcome.reason,
@@ -1861,9 +1903,9 @@ async function runEvaluation(options) {
1861
1903
  }
1862
1904
  return results;
1863
1905
  }
1864
- async function runTestCase(options) {
1906
+ async function runEvalCase(options) {
1865
1907
  const {
1866
- testCase,
1908
+ evalCase,
1867
1909
  provider,
1868
1910
  target,
1869
1911
  graders,
@@ -1876,11 +1918,11 @@ async function runTestCase(options) {
1876
1918
  signal,
1877
1919
  judgeProvider
1878
1920
  } = options;
1879
- const promptInputs = await buildPromptInputs(testCase);
1921
+ const promptInputs = await buildPromptInputs(evalCase);
1880
1922
  if (promptDumpDir) {
1881
- await dumpPrompt(promptDumpDir, testCase, promptInputs);
1923
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
1882
1924
  }
1883
- const cacheKey = useCache ? createCacheKey(provider, target, testCase, promptInputs) : void 0;
1925
+ const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
1884
1926
  let cachedResponse;
1885
1927
  if (cacheKey && cache) {
1886
1928
  cachedResponse = await cache.get(cacheKey);
@@ -1893,7 +1935,7 @@ async function runTestCase(options) {
1893
1935
  while (!providerResponse && attempt < attemptBudget) {
1894
1936
  try {
1895
1937
  providerResponse = await invokeProvider(provider, {
1896
- testCase,
1938
+ evalCase,
1897
1939
  target,
1898
1940
  promptInputs,
1899
1941
  attempt,
@@ -1906,12 +1948,12 @@ async function runTestCase(options) {
1906
1948
  attempt += 1;
1907
1949
  continue;
1908
1950
  }
1909
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
1951
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
1910
1952
  }
1911
1953
  }
1912
1954
  if (!providerResponse) {
1913
1955
  return buildErrorResult(
1914
- testCase,
1956
+ evalCase,
1915
1957
  target.name,
1916
1958
  nowFn(),
1917
1959
  lastError ?? new Error("Provider did not return a response"),
@@ -1921,7 +1963,7 @@ async function runTestCase(options) {
1921
1963
  if (cacheKey && cache && !cachedResponse) {
1922
1964
  await cache.set(cacheKey, providerResponse);
1923
1965
  }
1924
- const graderKind = testCase.grader ?? "heuristic";
1966
+ const graderKind = evalCase.grader ?? "heuristic";
1925
1967
  const activeGrader = graders[graderKind] ?? graders.heuristic;
1926
1968
  if (!activeGrader) {
1927
1969
  throw new Error(`No grader registered for kind '${graderKind}'`);
@@ -1930,7 +1972,7 @@ async function runTestCase(options) {
1930
1972
  try {
1931
1973
  const gradeTimestamp = nowFn();
1932
1974
  grade = await activeGrader.grade({
1933
- testCase,
1975
+ evalCase,
1934
1976
  candidate: providerResponse.text ?? "",
1935
1977
  target,
1936
1978
  provider,
@@ -1940,17 +1982,18 @@ async function runTestCase(options) {
1940
1982
  judgeProvider
1941
1983
  });
1942
1984
  } catch (error) {
1943
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
1985
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
1944
1986
  }
1945
1987
  const completedAt = nowFn();
1946
1988
  const rawRequest = {
1947
1989
  request: promptInputs.request,
1948
1990
  guidelines: promptInputs.guidelines,
1949
- guideline_paths: testCase.guideline_paths
1991
+ guideline_paths: evalCase.guideline_paths,
1992
+ system_message: promptInputs.systemMessage ?? ""
1950
1993
  };
1951
1994
  return {
1952
- eval_id: testCase.id,
1953
- conversation_id: testCase.conversation_id,
1995
+ eval_id: evalCase.id,
1996
+ conversation_id: evalCase.conversation_id,
1954
1997
  score: grade.score,
1955
1998
  hits: grade.hits,
1956
1999
  misses: grade.misses,
@@ -1964,11 +2007,11 @@ async function runTestCase(options) {
1964
2007
  grader_raw_request: grade.graderRawRequest
1965
2008
  };
1966
2009
  }
1967
- function filterTestCases(testCases, evalId) {
2010
+ function filterEvalCases(evalCases, evalId) {
1968
2011
  if (!evalId) {
1969
- return testCases;
2012
+ return evalCases;
1970
2013
  }
1971
- return testCases.filter((testCase) => testCase.id === evalId);
2014
+ return evalCases.filter((evalCase) => evalCase.id === evalId);
1972
2015
  }
1973
2016
  function buildGraderRegistry(overrides, resolveJudgeProvider) {
1974
2017
  const heuristic = overrides?.heuristic ?? new HeuristicGrader();
@@ -1986,16 +2029,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
1986
2029
  llm_judge: llmJudge
1987
2030
  };
1988
2031
  }
1989
- async function dumpPrompt(directory, testCase, promptInputs) {
2032
+ async function dumpPrompt(directory, evalCase, promptInputs) {
1990
2033
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1991
- const filename = `${timestamp}_${sanitizeFilename(testCase.id)}.json`;
2034
+ const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
1992
2035
  const filePath = path4.resolve(directory, filename);
1993
2036
  await mkdir(path4.dirname(filePath), { recursive: true });
1994
2037
  const payload = {
1995
- eval_id: testCase.id,
2038
+ eval_id: evalCase.id,
1996
2039
  request: promptInputs.request,
1997
2040
  guidelines: promptInputs.guidelines,
1998
- guideline_paths: testCase.guideline_paths
2041
+ guideline_paths: evalCase.guideline_paths
1999
2042
  };
2000
2043
  await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
2001
2044
  }
@@ -2007,7 +2050,7 @@ function sanitizeFilename(value) {
2007
2050
  return sanitized.length > 0 ? sanitized : randomUUID2();
2008
2051
  }
2009
2052
  async function invokeProvider(provider, options) {
2010
- const { testCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2053
+ const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2011
2054
  const controller = new AbortController();
2012
2055
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
2013
2056
  if (signal) {
@@ -2017,12 +2060,12 @@ async function invokeProvider(provider, options) {
2017
2060
  return await provider.invoke({
2018
2061
  prompt: promptInputs.request,
2019
2062
  guidelines: promptInputs.guidelines,
2020
- attachments: testCase.guideline_paths,
2021
- testCaseId: testCase.id,
2063
+ guideline_patterns: evalCase.guideline_patterns,
2064
+ attachments: evalCase.file_paths,
2065
+ evalCaseId: evalCase.id,
2022
2066
  attempt,
2023
2067
  metadata: {
2024
- target: target.name,
2025
- grader: testCase.grader
2068
+ systemPrompt: promptInputs.systemMessage ?? ""
2026
2069
  },
2027
2070
  signal: controller.signal
2028
2071
  });
@@ -2032,17 +2075,18 @@ async function invokeProvider(provider, options) {
2032
2075
  }
2033
2076
  }
2034
2077
  }
2035
- function buildErrorResult(testCase, targetName, timestamp, error, promptInputs) {
2078
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
2036
2079
  const message = error instanceof Error ? error.message : String(error);
2037
2080
  const rawRequest = {
2038
2081
  request: promptInputs.request,
2039
2082
  guidelines: promptInputs.guidelines,
2040
- guideline_paths: testCase.guideline_paths,
2083
+ guideline_paths: evalCase.guideline_paths,
2084
+ system_message: promptInputs.systemMessage ?? "",
2041
2085
  error: message
2042
2086
  };
2043
2087
  return {
2044
- eval_id: testCase.id,
2045
- conversation_id: testCase.conversation_id,
2088
+ eval_id: evalCase.id,
2089
+ conversation_id: evalCase.conversation_id,
2046
2090
  score: 0,
2047
2091
  hits: [],
2048
2092
  misses: [`Error: ${message}`],
@@ -2054,13 +2098,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
2054
2098
  raw_request: rawRequest
2055
2099
  };
2056
2100
  }
2057
- function createCacheKey(provider, target, testCase, promptInputs) {
2101
+ function createCacheKey(provider, target, evalCase, promptInputs) {
2058
2102
  const hash = createHash("sha256");
2059
2103
  hash.update(provider.id);
2060
2104
  hash.update(target.name);
2061
- hash.update(testCase.id);
2105
+ hash.update(evalCase.id);
2062
2106
  hash.update(promptInputs.request);
2063
2107
  hash.update(promptInputs.guidelines);
2108
+ hash.update(promptInputs.systemMessage ?? "");
2064
2109
  return hash.digest("hex");
2065
2110
  }
2066
2111
  function isTimeoutLike(error) {
@@ -2088,7 +2133,9 @@ export {
2088
2133
  HeuristicGrader,
2089
2134
  QualityGrader,
2090
2135
  TEST_MESSAGE_ROLES,
2136
+ buildDirectoryChain,
2091
2137
  buildPromptInputs,
2138
+ buildSearchRoots,
2092
2139
  calculateHits,
2093
2140
  calculateMisses,
2094
2141
  createAgentKernel,
@@ -2096,6 +2143,8 @@ export {
2096
2143
  ensureVSCodeSubagents,
2097
2144
  extractAspects,
2098
2145
  extractCodeBlocks,
2146
+ fileExists,
2147
+ findGitRoot,
2099
2148
  getHitCount,
2100
2149
  isErrorLike,
2101
2150
  isGraderKind,
@@ -2105,12 +2154,13 @@ export {
2105
2154
  isTestMessage,
2106
2155
  isTestMessageRole,
2107
2156
  listTargetNames,
2108
- loadTestCases,
2157
+ loadEvalCases,
2109
2158
  readTargetDefinitions,
2110
2159
  resolveAndCreateProvider,
2160
+ resolveFileReference,
2111
2161
  resolveTargetDefinition,
2162
+ runEvalCase,
2112
2163
  runEvaluation,
2113
- runTestCase,
2114
2164
  scoreCandidateResponse
2115
2165
  };
2116
2166
  //# sourceMappingURL=index.js.map