@agentv/core 0.2.3 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,11 @@
1
1
  import {
2
+ TARGETS_SCHEMA_V2,
3
+ buildDirectoryChain,
2
4
  buildSearchRoots,
5
+ fileExists,
6
+ findGitRoot,
3
7
  resolveFileReference
4
- } from "./chunk-5REK5RSI.js";
8
+ } from "./chunk-XXNQA4EW.js";
5
9
 
6
10
  // src/evaluation/types.ts
7
11
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -55,6 +59,7 @@ function getHitCount(result) {
55
59
  }
56
60
 
57
61
  // src/evaluation/yaml-parser.ts
62
+ import micromatch from "micromatch";
58
63
  import { constants } from "node:fs";
59
64
  import { access, readFile } from "node:fs/promises";
60
65
  import path from "node:path";
@@ -64,9 +69,52 @@ var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
64
69
  var ANSI_YELLOW = "\x1B[33m";
65
70
  var ANSI_RESET = "\x1B[0m";
66
71
  var SCHEMA_EVAL_V2 = "agentv-eval-v2";
67
- function isGuidelineFile(filePath) {
72
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
73
+ async function loadConfig(evalFilePath, repoRoot) {
74
+ const directories = buildDirectoryChain(evalFilePath, repoRoot);
75
+ for (const directory of directories) {
76
+ const configPath = path.join(directory, ".agentv", "config.yaml");
77
+ if (!await fileExists2(configPath)) {
78
+ continue;
79
+ }
80
+ try {
81
+ const rawConfig = await readFile(configPath, "utf8");
82
+ const parsed = parse(rawConfig);
83
+ if (!isJsonObject(parsed)) {
84
+ logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
85
+ continue;
86
+ }
87
+ const config = parsed;
88
+ const schema = config.$schema;
89
+ if (schema !== SCHEMA_CONFIG_V2) {
90
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
91
+ Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
92
+ logWarning(message);
93
+ continue;
94
+ }
95
+ const guidelinePatterns = config.guideline_patterns;
96
+ if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
97
+ logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
98
+ continue;
99
+ }
100
+ if (Array.isArray(guidelinePatterns) && !guidelinePatterns.every((p) => typeof p === "string")) {
101
+ logWarning(`Invalid guideline_patterns in ${configPath}, all entries must be strings`);
102
+ continue;
103
+ }
104
+ return {
105
+ guideline_patterns: guidelinePatterns
106
+ };
107
+ } catch (error) {
108
+ logWarning(`Could not read .agentv/config.yaml at ${configPath}: ${error.message}`);
109
+ continue;
110
+ }
111
+ }
112
+ return null;
113
+ }
114
+ function isGuidelineFile(filePath, patterns) {
68
115
  const normalized = filePath.split("\\").join("/");
69
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
116
+ const patternsToUse = patterns ?? [];
117
+ return micromatch.isMatch(normalized, patternsToUse);
70
118
  }
71
119
  function extractCodeBlocks(segments) {
72
120
  const codeBlocks = [];
@@ -86,43 +134,45 @@ function extractCodeBlocks(segments) {
86
134
  }
87
135
  return codeBlocks;
88
136
  }
89
- async function loadTestCases(testFilePath, repoRoot, options) {
137
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
90
138
  const verbose = options?.verbose ?? false;
91
- const absoluteTestPath = path.resolve(testFilePath);
92
- if (!await fileExists(absoluteTestPath)) {
93
- throw new Error(`Test file not found: ${testFilePath}`);
139
+ const absoluteTestPath = path.resolve(evalFilePath);
140
+ if (!await fileExists2(absoluteTestPath)) {
141
+ throw new Error(`Test file not found: ${evalFilePath}`);
94
142
  }
95
143
  const repoRootPath = resolveToAbsolutePath(repoRoot);
96
144
  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
145
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
146
+ const guidelinePatterns = config?.guideline_patterns;
97
147
  const rawFile = await readFile(absoluteTestPath, "utf8");
98
148
  const parsed = parse(rawFile);
99
149
  if (!isJsonObject(parsed)) {
100
- throw new Error(`Invalid test file format: ${testFilePath}`);
150
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
101
151
  }
102
152
  const suite = parsed;
103
153
  const schema = suite.$schema;
104
154
  if (schema !== SCHEMA_EVAL_V2) {
105
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${testFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${testFilePath}.
155
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
106
156
  Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
107
157
  throw new Error(message);
108
158
  }
109
159
  const rawTestcases = suite.evalcases;
110
160
  if (!Array.isArray(rawTestcases)) {
111
- throw new Error(`Invalid test file format: ${testFilePath} - missing 'evalcases' field`);
161
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
112
162
  }
113
163
  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
114
164
  const results = [];
115
- for (const rawTestcase of rawTestcases) {
116
- if (!isJsonObject(rawTestcase)) {
165
+ for (const rawEvalcase of rawTestcases) {
166
+ if (!isJsonObject(rawEvalcase)) {
117
167
  logWarning("Skipping invalid test case entry (expected object)");
118
168
  continue;
119
169
  }
120
- const testcase = rawTestcase;
121
- const id = asString(testcase.id);
122
- const conversationId = asString(testcase.conversation_id);
123
- const outcome = asString(testcase.outcome);
124
- const inputMessagesValue = testcase.input_messages;
125
- const expectedMessagesValue = testcase.expected_messages;
170
+ const evalcase = rawEvalcase;
171
+ const id = asString(evalcase.id);
172
+ const conversationId = asString(evalcase.conversation_id);
173
+ const outcome = asString(evalcase.outcome);
174
+ const inputMessagesValue = evalcase.input_messages;
175
+ const expectedMessagesValue = evalcase.expected_messages;
126
176
  if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
127
177
  logWarning(`Skipping incomplete test case: ${id ?? "unknown"}`);
128
178
  continue;
@@ -135,6 +185,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
135
185
  const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
136
186
  const assistantMessages = expectedMessages.filter((message) => message.role === "assistant");
137
187
  const userMessages = inputMessages.filter((message) => message.role === "user");
188
+ const systemMessages = inputMessages.filter((message) => message.role === "system");
138
189
  if (assistantMessages.length === 0) {
139
190
  logWarning(`No assistant message found for test case: ${id}`);
140
191
  continue;
@@ -142,6 +193,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
142
193
  if (assistantMessages.length > 1) {
143
194
  logWarning(`Multiple assistant messages found for test case: ${id}, using first`);
144
195
  }
196
+ if (systemMessages.length > 1) {
197
+ logWarning(`Multiple system messages found for test case: ${id}, using first`);
198
+ }
199
+ let systemMessageContent;
200
+ if (systemMessages.length > 0) {
201
+ const content = systemMessages[0]?.content;
202
+ if (typeof content === "string") {
203
+ systemMessageContent = content;
204
+ } else if (Array.isArray(content)) {
205
+ const textParts = [];
206
+ for (const segment of content) {
207
+ if (isJsonObject(segment)) {
208
+ const value = segment.value;
209
+ if (typeof value === "string") {
210
+ textParts.push(value);
211
+ }
212
+ }
213
+ }
214
+ if (textParts.length > 0) {
215
+ systemMessageContent = textParts.join("\n\n");
216
+ }
217
+ }
218
+ }
145
219
  const userSegments = [];
146
220
  const guidelinePaths = [];
147
221
  const userTextParts = [];
@@ -173,7 +247,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
173
247
  }
174
248
  try {
175
249
  const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
176
- if (isGuidelineFile(displayPath)) {
250
+ const relativeToRepo = path.relative(repoRootPath, resolvedPath);
251
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
177
252
  guidelinePaths.push(path.resolve(resolvedPath));
178
253
  if (verbose) {
179
254
  console.log(` [Guideline] Found: ${displayPath}`);
@@ -183,7 +258,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
183
258
  userSegments.push({
184
259
  type: "file",
185
260
  path: displayPath,
186
- text: fileContent
261
+ text: fileContent,
262
+ resolvedPath: path.resolve(resolvedPath)
187
263
  });
188
264
  if (verbose) {
189
265
  console.log(` [File] Found: ${displayPath}`);
@@ -205,16 +281,29 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
205
281
  }
206
282
  const codeSnippets = extractCodeBlocks(userSegments);
207
283
  const assistantContent = assistantMessages[0]?.content;
208
- const expectedAssistantRaw = normalizeAssistantContent(assistantContent);
284
+ const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
209
285
  const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
210
- const testCaseGrader = coerceGrader(testcase.grader) ?? globalGrader;
286
+ const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
287
+ const userFilePaths = [];
288
+ for (const segment of userSegments) {
289
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
290
+ userFilePaths.push(segment.resolvedPath);
291
+ }
292
+ }
293
+ const allFilePaths = [
294
+ ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
295
+ ...userFilePaths
296
+ ];
211
297
  const testCase = {
212
298
  id,
213
299
  conversation_id: conversationId,
214
300
  task: userTextPrompt,
215
301
  user_segments: userSegments,
302
+ system_message: systemMessageContent,
216
303
  expected_assistant_raw: expectedAssistantRaw,
217
304
  guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
305
+ guideline_patterns: guidelinePatterns,
306
+ file_paths: allFilePaths,
218
307
  code_snippets: codeSnippets,
219
308
  outcome,
220
309
  grader: testCaseGrader
@@ -239,7 +328,7 @@ async function buildPromptInputs(testCase) {
239
328
  const guidelineContents = [];
240
329
  for (const rawPath of testCase.guideline_paths) {
241
330
  const absolutePath = path.resolve(rawPath);
242
- if (!await fileExists(absolutePath)) {
331
+ if (!await fileExists2(absolutePath)) {
243
332
  logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
244
333
  continue;
245
334
  }
@@ -280,9 +369,9 @@ ${body}`);
280
369
  }
281
370
  const request = requestParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
282
371
  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
283
- return { request, guidelines };
372
+ return { request, guidelines, systemMessage: testCase.system_message };
284
373
  }
285
- async function fileExists(absolutePath) {
374
+ async function fileExists2(absolutePath) {
286
375
  try {
287
376
  await access(absolutePath, constants.F_OK);
288
377
  return true;
@@ -321,7 +410,7 @@ function cloneJsonValue(value) {
321
410
  }
322
411
  return cloneJsonObject(value);
323
412
  }
324
- function normalizeAssistantContent(content) {
413
+ async function resolveAssistantContent(content, searchRoots, verbose) {
325
414
  if (typeof content === "string") {
326
415
  return content;
327
416
  }
@@ -334,12 +423,42 @@ function normalizeAssistantContent(content) {
334
423
  parts.push(entry);
335
424
  continue;
336
425
  }
337
- const textValue = asString(entry["text"]);
426
+ if (!isJsonObject(entry)) {
427
+ continue;
428
+ }
429
+ const segmentType = asString(entry.type);
430
+ if (segmentType === "file") {
431
+ const rawValue = asString(entry.value);
432
+ if (!rawValue) {
433
+ continue;
434
+ }
435
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
436
+ rawValue,
437
+ searchRoots
438
+ );
439
+ if (!resolvedPath) {
440
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
441
+ logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
442
+ continue;
443
+ }
444
+ try {
445
+ const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
446
+ parts.push(fileContent);
447
+ if (verbose) {
448
+ console.log(` [Expected Assistant File] Found: ${displayPath}`);
449
+ console.log(` Resolved to: ${resolvedPath}`);
450
+ }
451
+ } catch (error) {
452
+ logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
453
+ }
454
+ continue;
455
+ }
456
+ const textValue = asString(entry.text);
338
457
  if (typeof textValue === "string") {
339
458
  parts.push(textValue);
340
459
  continue;
341
460
  }
342
- const valueValue = asString(entry["value"]);
461
+ const valueValue = asString(entry.value);
343
462
  if (typeof valueValue === "string") {
344
463
  parts.push(valueValue);
345
464
  continue;
@@ -376,15 +495,18 @@ function buildChatPrompt(request) {
376
495
  return request.chatPrompt;
377
496
  }
378
497
  const systemSegments = [];
379
- if (request.guidelines && request.guidelines.trim().length > 0) {
380
- systemSegments.push(`Guidelines:
381
- ${request.guidelines.trim()}`);
382
- }
383
498
  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
384
499
  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
385
500
  systemSegments.push(metadataSystemPrompt.trim());
501
+ } else {
502
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
503
+ }
504
+ if (request.guidelines && request.guidelines.trim().length > 0) {
505
+ systemSegments.push(`[[ ## Guidelines ## ]]
506
+
507
+ ${request.guidelines.trim()}`);
386
508
  }
387
- const systemContent = systemSegments.length > 0 ? systemSegments.join("\n\n") : DEFAULT_SYSTEM_PROMPT;
509
+ const systemContent = systemSegments.join("\n\n");
388
510
  const userContent = request.prompt.trim();
389
511
  const prompt = [
390
512
  {
@@ -840,11 +962,9 @@ function isLikelyEnvReference(value) {
840
962
  }
841
963
 
842
964
  // src/evaluation/providers/vscode.ts
843
- import { mkdtemp, readFile as readFile2, rm, writeFile } from "node:fs/promises";
844
- import { tmpdir } from "node:os";
965
+ import { readFile as readFile2 } from "node:fs/promises";
845
966
  import path2 from "node:path";
846
967
  import { dispatchAgentSession, getSubagentRoot, provisionSubagents } from "subagent";
847
- var PROMPT_FILE_PREFIX = "bbeval-vscode-";
848
968
  var VSCodeProvider = class {
849
969
  id;
850
970
  kind;
@@ -861,128 +981,89 @@ var VSCodeProvider = class {
861
981
  throw new Error("VS Code provider request was aborted before dispatch");
862
982
  }
863
983
  const attachments = normalizeAttachments(request.attachments);
864
- const promptContent = buildPromptDocument(request, attachments);
865
- const directory = await mkdtemp(path2.join(tmpdir(), PROMPT_FILE_PREFIX));
866
- const promptPath = path2.join(directory, `${request.testCaseId ?? "request"}.prompt.md`);
867
- try {
868
- await writeFile(promptPath, promptContent, "utf8");
869
- const session = await dispatchAgentSession({
870
- userQuery: composeUserQuery(request),
871
- promptFile: promptPath,
872
- extraAttachments: attachments,
873
- wait: this.config.waitForResponse,
874
- dryRun: this.config.dryRun,
875
- vscodeCmd: this.config.command,
876
- subagentRoot: this.config.subagentRoot,
877
- workspaceTemplate: this.config.workspaceTemplate,
878
- silent: true
879
- });
880
- if (session.exitCode !== 0 || !session.responseFile) {
881
- const failure = session.error ?? "VS Code subagent did not produce a response";
882
- throw new Error(failure);
883
- }
884
- if (this.config.dryRun) {
885
- return {
886
- text: "",
887
- raw: {
888
- session,
889
- promptFile: promptPath,
890
- attachments
891
- }
892
- };
893
- }
894
- const responseText = await readFile2(session.responseFile, "utf8");
984
+ const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
985
+ const session = await dispatchAgentSession({
986
+ userQuery: promptContent,
987
+ // Use full prompt content instead of just request.prompt
988
+ extraAttachments: attachments,
989
+ wait: this.config.waitForResponse,
990
+ dryRun: this.config.dryRun,
991
+ vscodeCmd: this.config.command,
992
+ subagentRoot: this.config.subagentRoot,
993
+ workspaceTemplate: this.config.workspaceTemplate,
994
+ silent: true
995
+ });
996
+ if (session.exitCode !== 0 || !session.responseFile) {
997
+ const failure = session.error ?? "VS Code subagent did not produce a response";
998
+ throw new Error(failure);
999
+ }
1000
+ if (this.config.dryRun) {
895
1001
  return {
896
- text: responseText,
1002
+ text: "",
897
1003
  raw: {
898
1004
  session,
899
- promptFile: promptPath,
900
1005
  attachments
901
1006
  }
902
1007
  };
903
- } finally {
904
- await rm(directory, { recursive: true, force: true });
905
1008
  }
1009
+ const responseText = await readFile2(session.responseFile, "utf8");
1010
+ return {
1011
+ text: responseText,
1012
+ raw: {
1013
+ session,
1014
+ attachments
1015
+ }
1016
+ };
906
1017
  }
907
1018
  };
908
- function buildPromptDocument(request, attachments) {
1019
+ function buildPromptDocument(request, attachments, guidelinePatterns) {
909
1020
  const parts = [];
910
- const instructionFiles = collectInstructionFiles(attachments);
911
- if (instructionFiles.length > 0) {
912
- parts.push(buildMandatoryPrereadBlock(instructionFiles));
913
- }
914
- parts.push(`# BbEval Request`);
915
- if (request.testCaseId) {
916
- parts.push(`- Test Case: ${request.testCaseId}`);
917
- }
918
- if (request.metadata?.target) {
919
- parts.push(`- Target: ${String(request.metadata.target)}`);
920
- }
921
- parts.push("\n## Task\n", request.prompt.trim());
922
- if (request.guidelines && request.guidelines.trim().length > 0) {
923
- parts.push("\n## Guidelines\n", request.guidelines.trim());
924
- }
925
- if (attachments && attachments.length > 0) {
926
- const attachmentList = attachments.map((item) => `- ${item}`).join("\n");
927
- parts.push("\n## Attachments\n", attachmentList);
1021
+ const guidelineFiles = collectGuidelineFiles(attachments, guidelinePatterns);
1022
+ if (guidelineFiles.length > 0) {
1023
+ parts.push("\n", buildMandatoryPrereadBlock(guidelineFiles));
928
1024
  }
1025
+ parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
929
1026
  return parts.join("\n").trim();
930
1027
  }
931
- function buildMandatoryPrereadBlock(instructionFiles) {
932
- if (instructionFiles.length === 0) {
1028
+ function buildMandatoryPrereadBlock(guidelineFiles) {
1029
+ if (guidelineFiles.length === 0) {
933
1030
  return "";
934
1031
  }
935
1032
  const fileList = [];
936
- const tokenList = [];
937
1033
  let counter = 0;
938
- for (const absolutePath of instructionFiles) {
1034
+ for (const absolutePath of guidelineFiles) {
939
1035
  counter += 1;
940
1036
  const fileName = path2.basename(absolutePath);
941
1037
  const fileUri = pathToFileUri(absolutePath);
942
- fileList.push(`[${fileName}](${fileUri})`);
943
- tokenList.push(`INSTRUCTIONS_READ: \`${fileName}\` i=${counter} SHA256=<hex>`);
1038
+ fileList.push(`* [${fileName}](${fileUri})`);
944
1039
  }
945
- const filesText = fileList.join(", ");
946
- const tokensText = tokenList.join("\n");
1040
+ const filesText = fileList.join("\n");
947
1041
  const instruction = [
948
- `Read all instruction files: ${filesText}.`,
949
- `After reading each file, compute its SHA256 hash using this PowerShell command:`,
950
- "`Get-FileHash -Algorithm SHA256 -LiteralPath '<file-path>' | Select-Object -ExpandProperty Hash`.",
951
- `Then include, at the top of your reply, these exact tokens on separate lines:
1042
+ `Read all guideline files:
1043
+ ${filesText}.
952
1044
  `,
953
- tokensText,
954
- `
955
- Replace \`<hex>\` with the actual SHA256 hash value computed from the PowerShell command.`,
956
1045
  `If any file is missing, fail with ERROR: missing-file <filename> and stop.
957
1046
  `,
958
- `Then fetch all documentation required by the instructions before proceeding with your task.`
959
- ].join(" ");
960
- return `[[ ## mandatory_pre_read ## ]]
961
-
962
- ${instruction}
963
-
964
- `;
1047
+ `Then apply system_instructions on the user query below.`
1048
+ ].join("");
1049
+ return `${instruction}`;
965
1050
  }
966
- function collectInstructionFiles(attachments) {
1051
+ function collectGuidelineFiles(attachments, guidelinePatterns) {
967
1052
  if (!attachments || attachments.length === 0) {
968
1053
  return [];
969
1054
  }
970
1055
  const unique = /* @__PURE__ */ new Map();
971
1056
  for (const attachment of attachments) {
972
- if (!isInstructionPath(attachment)) {
973
- continue;
974
- }
975
1057
  const absolutePath = path2.resolve(attachment);
976
- if (!unique.has(absolutePath)) {
977
- unique.set(absolutePath, absolutePath);
1058
+ const normalized = absolutePath.split(path2.sep).join("/");
1059
+ if (isGuidelineFile(normalized, guidelinePatterns)) {
1060
+ if (!unique.has(absolutePath)) {
1061
+ unique.set(absolutePath, absolutePath);
1062
+ }
978
1063
  }
979
1064
  }
980
1065
  return Array.from(unique.values());
981
1066
  }
982
- function isInstructionPath(filePath) {
983
- const normalized = filePath.split(path2.sep).join("/");
984
- return normalized.endsWith(".instructions.md") || normalized.includes("/instructions/") || normalized.endsWith(".prompt.md") || normalized.includes("/prompts/");
985
- }
986
1067
  function pathToFileUri(filePath) {
987
1068
  const absolutePath = path2.isAbsolute(filePath) ? filePath : path2.resolve(filePath);
988
1069
  const normalizedPath = absolutePath.replace(/\\/g, "/");
@@ -991,14 +1072,6 @@ function pathToFileUri(filePath) {
991
1072
  }
992
1073
  return `file://${normalizedPath}`;
993
1074
  }
994
- function composeUserQuery(request) {
995
- const segments = [];
996
- segments.push(request.prompt.trim());
997
- if (request.guidelines && request.guidelines.trim().length > 0) {
998
- segments.push("\nGuidelines:\n", request.guidelines.trim());
999
- }
1000
- return segments.join("\n").trim();
1001
- }
1002
1075
  function normalizeAttachments(attachments) {
1003
1076
  if (!attachments || attachments.length === 0) {
1004
1077
  return void 0;
@@ -1056,18 +1129,24 @@ import { parse as parse2 } from "yaml";
1056
1129
  function isRecord(value) {
1057
1130
  return typeof value === "object" && value !== null && !Array.isArray(value);
1058
1131
  }
1059
- function checkVersion(parsed, absolutePath) {
1060
- const version = typeof parsed.version === "number" ? parsed.version : typeof parsed.version === "string" ? parseFloat(parsed.version) : void 0;
1061
- if (version === void 0) {
1132
+ function checkSchema(parsed, absolutePath) {
1133
+ const schema = parsed.$schema;
1134
+ if (schema === void 0) {
1062
1135
  throw new Error(
1063
- `Missing version field in targets.yaml at ${absolutePath}.
1064
- Please add 'version: 2.0' at the top of the file.`
1136
+ `Missing $schema field in targets.yaml at ${absolutePath}.
1137
+ Please add '$schema: ${TARGETS_SCHEMA_V2}' at the top of the file.`
1065
1138
  );
1066
1139
  }
1067
- if (version < 2) {
1140
+ if (typeof schema !== "string") {
1068
1141
  throw new Error(
1069
- `Outdated targets.yaml format (version ${version}) at ${absolutePath}.
1070
- Please update to version 2.0 format with 'targets' array.`
1142
+ `Invalid $schema field in targets.yaml at ${absolutePath}.
1143
+ Expected a string value '${TARGETS_SCHEMA_V2}'.`
1144
+ );
1145
+ }
1146
+ if (schema !== TARGETS_SCHEMA_V2) {
1147
+ throw new Error(
1148
+ `Invalid $schema '${schema}' in targets.yaml at ${absolutePath}.
1149
+ Expected '${TARGETS_SCHEMA_V2}'.`
1071
1150
  );
1072
1151
  }
1073
1152
  }
@@ -1099,7 +1178,7 @@ function assertTargetDefinition(value, index, filePath) {
1099
1178
  judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
1100
1179
  };
1101
1180
  }
1102
- async function fileExists2(filePath) {
1181
+ async function fileExists3(filePath) {
1103
1182
  try {
1104
1183
  await access2(filePath, constants2.F_OK);
1105
1184
  return true;
@@ -1109,15 +1188,15 @@ async function fileExists2(filePath) {
1109
1188
  }
1110
1189
  async function readTargetDefinitions(filePath) {
1111
1190
  const absolutePath = path3.resolve(filePath);
1112
- if (!await fileExists2(absolutePath)) {
1191
+ if (!await fileExists3(absolutePath)) {
1113
1192
  throw new Error(`targets.yaml not found at ${absolutePath}`);
1114
1193
  }
1115
1194
  const raw = await readFile3(absolutePath, "utf8");
1116
1195
  const parsed = parse2(raw);
1117
1196
  if (!isRecord(parsed)) {
1118
- throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with 'version' and 'targets' fields`);
1197
+ throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
1119
1198
  }
1120
- checkVersion(parsed, absolutePath);
1199
+ checkSchema(parsed, absolutePath);
1121
1200
  const targets = extractTargetsArray(parsed, absolutePath);
1122
1201
  const definitions = targets.map((entry, index) => assertTargetDefinition(entry, index, absolutePath));
1123
1202
  return definitions;
@@ -1339,7 +1418,7 @@ import { randomUUID } from "node:crypto";
1339
1418
  var HeuristicGrader = class {
1340
1419
  kind = "heuristic";
1341
1420
  grade(context) {
1342
- const expectedAspects = extractAspects(context.testCase.expected_assistant_raw);
1421
+ const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
1343
1422
  const result = scoreCandidateResponse(context.candidate, expectedAspects);
1344
1423
  const misses = [...result.misses];
1345
1424
  if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
@@ -1372,14 +1451,14 @@ var QualityGrader = class {
1372
1451
  if (!judgeProvider) {
1373
1452
  throw new Error("No judge provider available for LLM grading");
1374
1453
  }
1375
- const prompt = buildQualityPrompt(context.testCase, context.candidate);
1454
+ const prompt = buildQualityPrompt(context.evalCase, context.candidate);
1376
1455
  const metadata = {
1377
1456
  systemPrompt: QUALITY_SYSTEM_PROMPT
1378
1457
  };
1379
1458
  const response = await judgeProvider.invoke({
1380
1459
  prompt,
1381
1460
  metadata,
1382
- testCaseId: context.testCase.id,
1461
+ evalCaseId: context.evalCase.id,
1383
1462
  attempt: context.attempt,
1384
1463
  maxOutputTokens: this.maxOutputTokens,
1385
1464
  temperature: this.temperature
@@ -1425,16 +1504,16 @@ var QUALITY_SYSTEM_PROMPT = [
1425
1504
  function buildQualityPrompt(testCase, candidate) {
1426
1505
  const parts = [
1427
1506
  "[[ ## expected_outcome ## ]]",
1428
- testCase.outcome,
1507
+ testCase.outcome.trim(),
1429
1508
  "",
1430
1509
  "[[ ## request ## ]]",
1431
- testCase.task,
1510
+ testCase.task.trim(),
1432
1511
  "",
1433
1512
  "[[ ## reference_answer ## ]]",
1434
- testCase.expected_assistant_raw,
1513
+ testCase.expected_assistant_raw.trim(),
1435
1514
  "",
1436
1515
  "[[ ## generated_answer ## ]]",
1437
- candidate,
1516
+ candidate.trim(),
1438
1517
  "",
1439
1518
  "Respond with a single JSON object matching the schema described in the system prompt."
1440
1519
  ];
@@ -1678,17 +1757,17 @@ async function runEvaluation(options) {
1678
1757
  cache,
1679
1758
  useCache,
1680
1759
  now,
1681
- testId,
1760
+ evalId,
1682
1761
  verbose,
1683
1762
  onResult,
1684
1763
  onProgress
1685
1764
  } = options;
1686
- const load = loadTestCases;
1687
- const testCases = await load(testFilePath, repoRoot, { verbose });
1688
- const filteredTestCases = filterTestCases(testCases, testId);
1689
- if (filteredTestCases.length === 0) {
1690
- if (testId) {
1691
- throw new Error(`Test case with id '${testId}' not found in ${testFilePath}`);
1765
+ const load = loadEvalCases;
1766
+ const evalCases = await load(testFilePath, repoRoot, { verbose });
1767
+ const filteredEvalCases = filterEvalCases(evalCases, evalId);
1768
+ if (filteredEvalCases.length === 0) {
1769
+ if (evalId) {
1770
+ throw new Error(`Test case with id '${evalId}' not found in ${testFilePath}`);
1692
1771
  }
1693
1772
  return [];
1694
1773
  }
@@ -1732,11 +1811,11 @@ async function runEvaluation(options) {
1732
1811
  };
1733
1812
  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
1734
1813
  const primaryProvider = getOrCreateProvider(target);
1735
- if (onProgress && filteredTestCases.length > 0) {
1736
- for (let i = 0; i < filteredTestCases.length; i++) {
1814
+ if (onProgress && filteredEvalCases.length > 0) {
1815
+ for (let i = 0; i < filteredEvalCases.length; i++) {
1737
1816
  await onProgress({
1738
1817
  workerId: i + 1,
1739
- testId: filteredTestCases[i].id,
1818
+ evalId: filteredEvalCases[i].id,
1740
1819
  status: "pending"
1741
1820
  });
1742
1821
  }
@@ -1744,23 +1823,23 @@ async function runEvaluation(options) {
1744
1823
  const workers = options.maxConcurrency ?? target.workers ?? 1;
1745
1824
  const limit = pLimit(workers);
1746
1825
  let nextWorkerId = 1;
1747
- const workerIdByTestId = /* @__PURE__ */ new Map();
1748
- const promises = filteredTestCases.map(
1749
- (testCase) => limit(async () => {
1826
+ const workerIdByEvalId = /* @__PURE__ */ new Map();
1827
+ const promises = filteredEvalCases.map(
1828
+ (evalCase) => limit(async () => {
1750
1829
  const workerId = nextWorkerId++;
1751
- workerIdByTestId.set(testCase.id, workerId);
1830
+ workerIdByEvalId.set(evalCase.id, workerId);
1752
1831
  if (onProgress) {
1753
1832
  await onProgress({
1754
1833
  workerId,
1755
- testId: testCase.id,
1834
+ evalId: evalCase.id,
1756
1835
  status: "running",
1757
1836
  startedAt: Date.now()
1758
1837
  });
1759
1838
  }
1760
1839
  try {
1761
1840
  const judgeProvider = await resolveJudgeProvider(target);
1762
- const result = await runTestCase({
1763
- testCase,
1841
+ const result = await runEvalCase({
1842
+ evalCase,
1764
1843
  provider: primaryProvider,
1765
1844
  target,
1766
1845
  graders: graderRegistry,
@@ -1775,7 +1854,7 @@ async function runEvaluation(options) {
1775
1854
  if (onProgress) {
1776
1855
  await onProgress({
1777
1856
  workerId,
1778
- testId: testCase.id,
1857
+ evalId: evalCase.id,
1779
1858
  status: "completed",
1780
1859
  startedAt: 0,
1781
1860
  // Not used for completed status
@@ -1790,7 +1869,7 @@ async function runEvaluation(options) {
1790
1869
  if (onProgress) {
1791
1870
  await onProgress({
1792
1871
  workerId,
1793
- testId: testCase.id,
1872
+ evalId: evalCase.id,
1794
1873
  status: "failed",
1795
1874
  completedAt: Date.now(),
1796
1875
  error: error instanceof Error ? error.message : String(error)
@@ -1807,10 +1886,10 @@ async function runEvaluation(options) {
1807
1886
  if (outcome.status === "fulfilled") {
1808
1887
  results.push(outcome.value);
1809
1888
  } else {
1810
- const testCase = filteredTestCases[i];
1811
- const promptInputs = await buildPromptInputs(testCase);
1889
+ const evalCase = filteredEvalCases[i];
1890
+ const promptInputs = await buildPromptInputs(evalCase);
1812
1891
  const errorResult = buildErrorResult(
1813
- testCase,
1892
+ evalCase,
1814
1893
  target.name,
1815
1894
  (now ?? (() => /* @__PURE__ */ new Date()))(),
1816
1895
  outcome.reason,
@@ -1824,9 +1903,9 @@ async function runEvaluation(options) {
1824
1903
  }
1825
1904
  return results;
1826
1905
  }
1827
- async function runTestCase(options) {
1906
+ async function runEvalCase(options) {
1828
1907
  const {
1829
- testCase,
1908
+ evalCase,
1830
1909
  provider,
1831
1910
  target,
1832
1911
  graders,
@@ -1839,11 +1918,11 @@ async function runTestCase(options) {
1839
1918
  signal,
1840
1919
  judgeProvider
1841
1920
  } = options;
1842
- const promptInputs = await buildPromptInputs(testCase);
1921
+ const promptInputs = await buildPromptInputs(evalCase);
1843
1922
  if (promptDumpDir) {
1844
- await dumpPrompt(promptDumpDir, testCase, promptInputs);
1923
+ await dumpPrompt(promptDumpDir, evalCase, promptInputs);
1845
1924
  }
1846
- const cacheKey = useCache ? createCacheKey(provider, target, testCase, promptInputs) : void 0;
1925
+ const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
1847
1926
  let cachedResponse;
1848
1927
  if (cacheKey && cache) {
1849
1928
  cachedResponse = await cache.get(cacheKey);
@@ -1856,7 +1935,7 @@ async function runTestCase(options) {
1856
1935
  while (!providerResponse && attempt < attemptBudget) {
1857
1936
  try {
1858
1937
  providerResponse = await invokeProvider(provider, {
1859
- testCase,
1938
+ evalCase,
1860
1939
  target,
1861
1940
  promptInputs,
1862
1941
  attempt,
@@ -1869,12 +1948,12 @@ async function runTestCase(options) {
1869
1948
  attempt += 1;
1870
1949
  continue;
1871
1950
  }
1872
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
1951
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
1873
1952
  }
1874
1953
  }
1875
1954
  if (!providerResponse) {
1876
1955
  return buildErrorResult(
1877
- testCase,
1956
+ evalCase,
1878
1957
  target.name,
1879
1958
  nowFn(),
1880
1959
  lastError ?? new Error("Provider did not return a response"),
@@ -1884,7 +1963,7 @@ async function runTestCase(options) {
1884
1963
  if (cacheKey && cache && !cachedResponse) {
1885
1964
  await cache.set(cacheKey, providerResponse);
1886
1965
  }
1887
- const graderKind = testCase.grader ?? "heuristic";
1966
+ const graderKind = evalCase.grader ?? "heuristic";
1888
1967
  const activeGrader = graders[graderKind] ?? graders.heuristic;
1889
1968
  if (!activeGrader) {
1890
1969
  throw new Error(`No grader registered for kind '${graderKind}'`);
@@ -1893,7 +1972,7 @@ async function runTestCase(options) {
1893
1972
  try {
1894
1973
  const gradeTimestamp = nowFn();
1895
1974
  grade = await activeGrader.grade({
1896
- testCase,
1975
+ evalCase,
1897
1976
  candidate: providerResponse.text ?? "",
1898
1977
  target,
1899
1978
  provider,
@@ -1903,17 +1982,18 @@ async function runTestCase(options) {
1903
1982
  judgeProvider
1904
1983
  });
1905
1984
  } catch (error) {
1906
- return buildErrorResult(testCase, target.name, nowFn(), error, promptInputs);
1985
+ return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
1907
1986
  }
1908
1987
  const completedAt = nowFn();
1909
1988
  const rawRequest = {
1910
1989
  request: promptInputs.request,
1911
1990
  guidelines: promptInputs.guidelines,
1912
- guideline_paths: testCase.guideline_paths
1991
+ guideline_paths: evalCase.guideline_paths,
1992
+ system_message: promptInputs.systemMessage ?? ""
1913
1993
  };
1914
1994
  return {
1915
- test_id: testCase.id,
1916
- conversation_id: testCase.conversation_id,
1995
+ eval_id: evalCase.id,
1996
+ conversation_id: evalCase.conversation_id,
1917
1997
  score: grade.score,
1918
1998
  hits: grade.hits,
1919
1999
  misses: grade.misses,
@@ -1927,11 +2007,11 @@ async function runTestCase(options) {
1927
2007
  grader_raw_request: grade.graderRawRequest
1928
2008
  };
1929
2009
  }
1930
- function filterTestCases(testCases, testId) {
1931
- if (!testId) {
1932
- return testCases;
2010
+ function filterEvalCases(evalCases, evalId) {
2011
+ if (!evalId) {
2012
+ return evalCases;
1933
2013
  }
1934
- return testCases.filter((testCase) => testCase.id === testId);
2014
+ return evalCases.filter((evalCase) => evalCase.id === evalId);
1935
2015
  }
1936
2016
  function buildGraderRegistry(overrides, resolveJudgeProvider) {
1937
2017
  const heuristic = overrides?.heuristic ?? new HeuristicGrader();
@@ -1949,16 +2029,16 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
1949
2029
  llm_judge: llmJudge
1950
2030
  };
1951
2031
  }
1952
- async function dumpPrompt(directory, testCase, promptInputs) {
2032
+ async function dumpPrompt(directory, evalCase, promptInputs) {
1953
2033
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1954
- const filename = `${timestamp}_${sanitizeFilename(testCase.id)}.json`;
2034
+ const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
1955
2035
  const filePath = path4.resolve(directory, filename);
1956
2036
  await mkdir(path4.dirname(filePath), { recursive: true });
1957
2037
  const payload = {
1958
- test_id: testCase.id,
2038
+ eval_id: evalCase.id,
1959
2039
  request: promptInputs.request,
1960
2040
  guidelines: promptInputs.guidelines,
1961
- guideline_paths: testCase.guideline_paths
2041
+ guideline_paths: evalCase.guideline_paths
1962
2042
  };
1963
2043
  await writeFile2(filePath, JSON.stringify(payload, null, 2), "utf8");
1964
2044
  }
@@ -1970,7 +2050,7 @@ function sanitizeFilename(value) {
1970
2050
  return sanitized.length > 0 ? sanitized : randomUUID2();
1971
2051
  }
1972
2052
  async function invokeProvider(provider, options) {
1973
- const { testCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
2053
+ const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
1974
2054
  const controller = new AbortController();
1975
2055
  const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
1976
2056
  if (signal) {
@@ -1980,12 +2060,12 @@ async function invokeProvider(provider, options) {
1980
2060
  return await provider.invoke({
1981
2061
  prompt: promptInputs.request,
1982
2062
  guidelines: promptInputs.guidelines,
1983
- attachments: testCase.guideline_paths,
1984
- testCaseId: testCase.id,
2063
+ guideline_patterns: evalCase.guideline_patterns,
2064
+ attachments: evalCase.file_paths,
2065
+ evalCaseId: evalCase.id,
1985
2066
  attempt,
1986
2067
  metadata: {
1987
- target: target.name,
1988
- grader: testCase.grader
2068
+ systemPrompt: promptInputs.systemMessage ?? ""
1989
2069
  },
1990
2070
  signal: controller.signal
1991
2071
  });
@@ -1995,17 +2075,18 @@ async function invokeProvider(provider, options) {
1995
2075
  }
1996
2076
  }
1997
2077
  }
1998
- function buildErrorResult(testCase, targetName, timestamp, error, promptInputs) {
2078
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs) {
1999
2079
  const message = error instanceof Error ? error.message : String(error);
2000
2080
  const rawRequest = {
2001
2081
  request: promptInputs.request,
2002
2082
  guidelines: promptInputs.guidelines,
2003
- guideline_paths: testCase.guideline_paths,
2083
+ guideline_paths: evalCase.guideline_paths,
2084
+ system_message: promptInputs.systemMessage ?? "",
2004
2085
  error: message
2005
2086
  };
2006
2087
  return {
2007
- test_id: testCase.id,
2008
- conversation_id: testCase.conversation_id,
2088
+ eval_id: evalCase.id,
2089
+ conversation_id: evalCase.conversation_id,
2009
2090
  score: 0,
2010
2091
  hits: [],
2011
2092
  misses: [`Error: ${message}`],
@@ -2017,13 +2098,14 @@ function buildErrorResult(testCase, targetName, timestamp, error, promptInputs)
2017
2098
  raw_request: rawRequest
2018
2099
  };
2019
2100
  }
2020
- function createCacheKey(provider, target, testCase, promptInputs) {
2101
+ function createCacheKey(provider, target, evalCase, promptInputs) {
2021
2102
  const hash = createHash("sha256");
2022
2103
  hash.update(provider.id);
2023
2104
  hash.update(target.name);
2024
- hash.update(testCase.id);
2105
+ hash.update(evalCase.id);
2025
2106
  hash.update(promptInputs.request);
2026
2107
  hash.update(promptInputs.guidelines);
2108
+ hash.update(promptInputs.systemMessage ?? "");
2027
2109
  return hash.digest("hex");
2028
2110
  }
2029
2111
  function isTimeoutLike(error) {
@@ -2051,7 +2133,9 @@ export {
2051
2133
  HeuristicGrader,
2052
2134
  QualityGrader,
2053
2135
  TEST_MESSAGE_ROLES,
2136
+ buildDirectoryChain,
2054
2137
  buildPromptInputs,
2138
+ buildSearchRoots,
2055
2139
  calculateHits,
2056
2140
  calculateMisses,
2057
2141
  createAgentKernel,
@@ -2059,6 +2143,8 @@ export {
2059
2143
  ensureVSCodeSubagents,
2060
2144
  extractAspects,
2061
2145
  extractCodeBlocks,
2146
+ fileExists,
2147
+ findGitRoot,
2062
2148
  getHitCount,
2063
2149
  isErrorLike,
2064
2150
  isGraderKind,
@@ -2068,12 +2154,13 @@ export {
2068
2154
  isTestMessage,
2069
2155
  isTestMessageRole,
2070
2156
  listTargetNames,
2071
- loadTestCases,
2157
+ loadEvalCases,
2072
2158
  readTargetDefinitions,
2073
2159
  resolveAndCreateProvider,
2160
+ resolveFileReference,
2074
2161
  resolveTargetDefinition,
2162
+ runEvalCase,
2075
2163
  runEvaluation,
2076
- runTestCase,
2077
2164
  scoreCandidateResponse
2078
2165
  };
2079
2166
  //# sourceMappingURL=index.js.map