@agentv/core 0.10.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,15 +33,15 @@ __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
35
35
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
36
- buildDirectoryChain: () => buildDirectoryChain,
36
+ buildDirectoryChain: () => buildDirectoryChain2,
37
37
  buildPromptInputs: () => buildPromptInputs,
38
- buildSearchRoots: () => buildSearchRoots,
38
+ buildSearchRoots: () => buildSearchRoots2,
39
39
  consumeCodexLogEntries: () => consumeCodexLogEntries,
40
40
  createAgentKernel: () => createAgentKernel,
41
41
  createProvider: () => createProvider,
42
42
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
43
43
  extractCodeBlocks: () => extractCodeBlocks,
44
- fileExists: () => fileExists,
44
+ fileExists: () => fileExists2,
45
45
  findGitRoot: () => findGitRoot,
46
46
  getHitCount: () => getHitCount,
47
47
  isEvaluatorKind: () => isEvaluatorKind,
@@ -57,7 +57,7 @@ __export(index_exports, {
57
57
  readTestSuiteMetadata: () => readTestSuiteMetadata,
58
58
  readTextFile: () => readTextFile,
59
59
  resolveAndCreateProvider: () => resolveAndCreateProvider,
60
- resolveFileReference: () => resolveFileReference,
60
+ resolveFileReference: () => resolveFileReference2,
61
61
  resolveTargetDefinition: () => resolveTargetDefinition,
62
62
  runEvalCase: () => runEvalCase,
63
63
  runEvaluation: () => runEvaluation,
@@ -116,47 +116,112 @@ function getHitCount(result) {
116
116
  }
117
117
 
118
118
  // src/evaluation/yaml-parser.ts
119
+ var import_promises5 = require("fs/promises");
120
+ var import_node_path6 = __toESM(require("path"), 1);
121
+ var import_yaml2 = require("yaml");
122
+
123
+ // src/evaluation/formatting/segment-formatter.ts
124
+ function extractCodeBlocks(segments) {
125
+ const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
126
+ const codeBlocks = [];
127
+ for (const segment of segments) {
128
+ const typeValue = segment["type"];
129
+ if (typeof typeValue !== "string" || typeValue !== "text") {
130
+ continue;
131
+ }
132
+ const textValue = segment["value"];
133
+ if (typeof textValue !== "string") {
134
+ continue;
135
+ }
136
+ const matches = textValue.match(CODE_BLOCK_PATTERN);
137
+ if (matches) {
138
+ codeBlocks.push(...matches);
139
+ }
140
+ }
141
+ return codeBlocks;
142
+ }
143
+ function formatFileContents(parts) {
144
+ const fileCount = parts.filter((p) => p.isFile).length;
145
+ if (fileCount > 0) {
146
+ return parts.map((part) => {
147
+ if (part.isFile && part.displayPath) {
148
+ return `<file path="${part.displayPath}">
149
+ ${part.content}
150
+ </file>`;
151
+ }
152
+ return part.content;
153
+ }).join("\n\n");
154
+ }
155
+ return parts.map((p) => p.content).join(" ");
156
+ }
157
+ function formatSegment(segment) {
158
+ const type = asString(segment.type);
159
+ if (type === "text") {
160
+ return asString(segment.value);
161
+ }
162
+ if (type === "guideline_ref") {
163
+ const refPath = asString(segment.path);
164
+ return refPath ? `<Attached: ${refPath}>` : void 0;
165
+ }
166
+ if (type === "file") {
167
+ const text = asString(segment.text);
168
+ const filePath = asString(segment.path);
169
+ if (text && filePath) {
170
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
171
+ }
172
+ }
173
+ return void 0;
174
+ }
175
+ function hasVisibleContent(segments) {
176
+ return segments.some((segment) => {
177
+ const type = asString(segment.type);
178
+ if (type === "text") {
179
+ const value = asString(segment.value);
180
+ return value !== void 0 && value.trim().length > 0;
181
+ }
182
+ if (type === "guideline_ref") {
183
+ return false;
184
+ }
185
+ if (type === "file") {
186
+ const text = asString(segment.text);
187
+ return text !== void 0 && text.trim().length > 0;
188
+ }
189
+ return false;
190
+ });
191
+ }
192
+ function asString(value) {
193
+ return typeof value === "string" ? value : void 0;
194
+ }
195
+
196
+ // src/evaluation/loaders/config-loader.ts
119
197
  var import_micromatch = __toESM(require("micromatch"), 1);
120
- var import_node_fs2 = require("fs");
121
198
  var import_promises2 = require("fs/promises");
122
199
  var import_node_path2 = __toESM(require("path"), 1);
123
- var import_node_url = require("url");
124
200
  var import_yaml = require("yaml");
125
201
 
126
- // src/evaluation/file-utils.ts
202
+ // src/evaluation/loaders/file-resolver.ts
127
203
  var import_node_fs = require("fs");
128
204
  var import_promises = require("fs/promises");
129
205
  var import_node_path = __toESM(require("path"), 1);
130
- async function fileExists(filePath) {
206
+ async function fileExists(absolutePath) {
131
207
  try {
132
- await (0, import_promises.access)(filePath, import_node_fs.constants.F_OK);
208
+ await (0, import_promises.access)(absolutePath, import_node_fs.constants.F_OK);
133
209
  return true;
134
210
  } catch {
135
211
  return false;
136
212
  }
137
213
  }
138
- function normalizeLineEndings(content) {
139
- return content.replace(/\r\n/g, "\n");
140
- }
141
- async function readTextFile(filePath) {
142
- const content = await (0, import_promises.readFile)(filePath, "utf8");
143
- return normalizeLineEndings(content);
144
- }
145
- async function findGitRoot(startPath) {
146
- let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
147
- const root = import_node_path.default.parse(currentDir).root;
148
- while (currentDir !== root) {
149
- const gitPath = import_node_path.default.join(currentDir, ".git");
150
- if (await fileExists(gitPath)) {
151
- return currentDir;
152
- }
153
- const parentDir = import_node_path.default.dirname(currentDir);
154
- if (parentDir === currentDir) {
155
- break;
214
+ function resolveToAbsolutePath(candidate) {
215
+ if (candidate instanceof URL) {
216
+ return new URL(candidate).pathname;
217
+ }
218
+ if (typeof candidate === "string") {
219
+ if (candidate.startsWith("file://")) {
220
+ return new URL(candidate).pathname;
156
221
  }
157
- currentDir = parentDir;
222
+ return import_node_path.default.resolve(candidate);
158
223
  }
159
- return null;
224
+ throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
160
225
  }
161
226
  function buildDirectoryChain(filePath, repoRoot) {
162
227
  const directories = [];
@@ -234,44 +299,15 @@ async function resolveFileReference(rawValue, searchRoots) {
234
299
  return { displayPath, attempted };
235
300
  }
236
301
 
237
- // src/evaluation/yaml-parser.ts
238
- var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
302
+ // src/evaluation/loaders/config-loader.ts
303
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
239
304
  var ANSI_YELLOW = "\x1B[33m";
240
305
  var ANSI_RESET = "\x1B[0m";
241
- var SCHEMA_EVAL_V2 = "agentv-eval-v2";
242
- var SCHEMA_CONFIG_V2 = "agentv-config-v2";
243
- async function readTestSuiteMetadata(testFilePath) {
244
- try {
245
- const absolutePath = import_node_path2.default.resolve(testFilePath);
246
- const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
247
- const parsed = (0, import_yaml.parse)(content);
248
- if (!isJsonObject(parsed)) {
249
- return {};
250
- }
251
- return { target: extractTargetFromSuite(parsed) };
252
- } catch {
253
- return {};
254
- }
255
- }
256
- function extractTargetFromSuite(suite) {
257
- const execution = suite.execution;
258
- if (execution && typeof execution === "object" && !Array.isArray(execution)) {
259
- const executionTarget = execution.target;
260
- if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
261
- return executionTarget.trim();
262
- }
263
- }
264
- const targetValue = suite.target;
265
- if (typeof targetValue === "string" && targetValue.trim().length > 0) {
266
- return targetValue.trim();
267
- }
268
- return void 0;
269
- }
270
306
  async function loadConfig(evalFilePath, repoRoot) {
271
307
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
272
308
  for (const directory of directories) {
273
309
  const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
274
- if (!await fileExists2(configPath)) {
310
+ if (!await fileExists(configPath)) {
275
311
  continue;
276
312
  }
277
313
  try {
@@ -313,24 +349,134 @@ function isGuidelineFile(filePath, patterns) {
313
349
  const patternsToUse = patterns ?? [];
314
350
  return import_micromatch.default.isMatch(normalized, patternsToUse);
315
351
  }
316
- function extractCodeBlocks(segments) {
317
- const codeBlocks = [];
318
- for (const segment of segments) {
319
- const typeValue = segment["type"];
320
- if (typeof typeValue !== "string" || typeValue !== "text") {
352
+ function extractTargetFromSuite(suite) {
353
+ const execution = suite.execution;
354
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
355
+ const executionTarget = execution.target;
356
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
357
+ return executionTarget.trim();
358
+ }
359
+ }
360
+ const targetValue = suite.target;
361
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
362
+ return targetValue.trim();
363
+ }
364
+ return void 0;
365
+ }
366
+ function logWarning(message) {
367
+ console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
368
+ }
369
+
370
+ // src/evaluation/loaders/evaluator-parser.ts
371
+ var import_node_path3 = __toESM(require("path"), 1);
372
+ var ANSI_YELLOW2 = "\x1B[33m";
373
+ var ANSI_RESET2 = "\x1B[0m";
374
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
375
+ const execution = rawEvalCase.execution;
376
+ const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
377
+ if (candidateEvaluators === void 0) {
378
+ return void 0;
379
+ }
380
+ if (!Array.isArray(candidateEvaluators)) {
381
+ logWarning2(`Skipping evaluators for '${evalId}': expected array`);
382
+ return void 0;
383
+ }
384
+ const evaluators = [];
385
+ for (const rawEvaluator of candidateEvaluators) {
386
+ if (!isJsonObject2(rawEvaluator)) {
387
+ logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
321
388
  continue;
322
389
  }
323
- const textValue = segment["value"];
324
- if (typeof textValue !== "string") {
390
+ const name = asString2(rawEvaluator.name);
391
+ const typeValue = rawEvaluator.type;
392
+ if (!name || !isEvaluatorKind(typeValue)) {
393
+ logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
325
394
  continue;
326
395
  }
327
- const matches = textValue.match(CODE_BLOCK_PATTERN);
328
- if (matches) {
329
- codeBlocks.push(...matches);
396
+ if (typeValue === "code") {
397
+ const script = asString2(rawEvaluator.script);
398
+ if (!script) {
399
+ logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
400
+ continue;
401
+ }
402
+ const cwd = asString2(rawEvaluator.cwd);
403
+ let resolvedCwd;
404
+ if (cwd) {
405
+ const resolved = await resolveFileReference(cwd, searchRoots);
406
+ if (resolved.resolvedPath) {
407
+ resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
408
+ } else {
409
+ logWarning2(
410
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
411
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
412
+ );
413
+ }
414
+ } else {
415
+ resolvedCwd = searchRoots[0];
416
+ }
417
+ evaluators.push({
418
+ name,
419
+ type: "code",
420
+ script,
421
+ cwd,
422
+ resolvedCwd
423
+ });
424
+ continue;
425
+ }
426
+ const prompt = asString2(rawEvaluator.prompt);
427
+ let promptPath;
428
+ if (prompt) {
429
+ const resolved = await resolveFileReference(prompt, searchRoots);
430
+ if (resolved.resolvedPath) {
431
+ promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
432
+ } else {
433
+ logWarning2(
434
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
435
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
436
+ );
437
+ }
330
438
  }
439
+ const _model = asString2(rawEvaluator.model);
440
+ evaluators.push({
441
+ name,
442
+ type: "llm_judge",
443
+ prompt,
444
+ promptPath
445
+ });
331
446
  }
332
- return codeBlocks;
447
+ return evaluators.length > 0 ? evaluators : void 0;
448
+ }
449
+ function coerceEvaluator(candidate, contextId) {
450
+ if (typeof candidate !== "string") {
451
+ return void 0;
452
+ }
453
+ if (isEvaluatorKind(candidate)) {
454
+ return candidate;
455
+ }
456
+ logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
457
+ return void 0;
458
+ }
459
+ function asString2(value) {
460
+ return typeof value === "string" ? value : void 0;
461
+ }
462
+ function isJsonObject2(value) {
463
+ return typeof value === "object" && value !== null && !Array.isArray(value);
333
464
  }
465
+ function logWarning2(message, details) {
466
+ if (details && details.length > 0) {
467
+ const detailBlock = details.join("\n");
468
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}
469
+ ${detailBlock}${ANSI_RESET2}`);
470
+ } else {
471
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
472
+ }
473
+ }
474
+
475
+ // src/evaluation/loaders/message-processor.ts
476
+ var import_promises3 = require("fs/promises");
477
+ var import_node_path4 = __toESM(require("path"), 1);
478
+ var ANSI_YELLOW3 = "\x1B[33m";
479
+ var ANSI_RESET3 = "\x1B[0m";
334
480
  async function processMessages(options) {
335
481
  const {
336
482
  messages,
@@ -356,9 +502,9 @@ async function processMessages(options) {
356
502
  if (!isJsonObject(rawSegment)) {
357
503
  continue;
358
504
  }
359
- const segmentType = asString(rawSegment.type);
505
+ const segmentType = asString3(rawSegment.type);
360
506
  if (segmentType === "file") {
361
- const rawValue = asString(rawSegment.value);
507
+ const rawValue = asString3(rawSegment.value);
362
508
  if (!rawValue) {
363
509
  continue;
364
510
  }
@@ -369,15 +515,15 @@ async function processMessages(options) {
369
515
  if (!resolvedPath) {
370
516
  const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
371
517
  const context = messageType === "input" ? "" : " in expected_messages";
372
- logWarning(`File not found${context}: ${displayPath}`, attempts);
518
+ logWarning3(`File not found${context}: ${displayPath}`, attempts);
373
519
  continue;
374
520
  }
375
521
  try {
376
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
522
+ const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
377
523
  if (messageType === "input" && guidelinePatterns && guidelinePaths) {
378
- const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
524
+ const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
379
525
  if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
380
- guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
526
+ guidelinePaths.push(import_node_path4.default.resolve(resolvedPath));
381
527
  if (verbose) {
382
528
  console.log(` [Guideline] Found: ${displayPath}`);
383
529
  console.log(` Resolved to: ${resolvedPath}`);
@@ -389,7 +535,7 @@ async function processMessages(options) {
389
535
  type: "file",
390
536
  path: displayPath,
391
537
  text: fileContent,
392
- resolvedPath: import_node_path2.default.resolve(resolvedPath)
538
+ resolvedPath: import_node_path4.default.resolve(resolvedPath)
393
539
  });
394
540
  if (verbose) {
395
541
  const label = messageType === "input" ? "[File]" : "[Expected Output File]";
@@ -398,7 +544,7 @@ async function processMessages(options) {
398
544
  }
399
545
  } catch (error) {
400
546
  const context = messageType === "input" ? "" : " expected output";
401
- logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
547
+ logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
402
548
  }
403
549
  continue;
404
550
  }
@@ -412,202 +558,120 @@ async function processMessages(options) {
412
558
  }
413
559
  return segments;
414
560
  }
415
- async function loadEvalCases(evalFilePath, repoRoot, options) {
416
- const verbose = options?.verbose ?? false;
417
- const evalIdFilter = options?.evalId;
418
- const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
419
- if (!await fileExists2(absoluteTestPath)) {
420
- throw new Error(`Test file not found: ${evalFilePath}`);
421
- }
422
- const repoRootPath = resolveToAbsolutePath(repoRoot);
423
- const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
424
- const config = await loadConfig(absoluteTestPath, repoRootPath);
425
- const guidelinePatterns = config?.guideline_patterns;
426
- const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
427
- const parsed = (0, import_yaml.parse)(rawFile);
428
- if (!isJsonObject(parsed)) {
429
- throw new Error(`Invalid test file format: ${evalFilePath}`);
430
- }
431
- const suite = parsed;
432
- const datasetNameFromSuite = asString(suite.dataset)?.trim();
433
- const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
434
- const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
435
- const schema = suite.$schema;
436
- if (schema !== SCHEMA_EVAL_V2) {
437
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
438
- Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
439
- throw new Error(message);
561
+ async function resolveAssistantContent(content, searchRoots, verbose) {
562
+ if (typeof content === "string") {
563
+ return content;
440
564
  }
441
- const rawTestcases = suite.evalcases;
442
- if (!Array.isArray(rawTestcases)) {
443
- throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
565
+ if (!content) {
566
+ return "";
444
567
  }
445
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
446
- const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
447
- const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
448
- const results = [];
449
- for (const rawEvalcase of rawTestcases) {
450
- if (!isJsonObject(rawEvalcase)) {
451
- logWarning("Skipping invalid eval case entry (expected object)");
568
+ const parts = [];
569
+ for (const entry of content) {
570
+ if (typeof entry === "string") {
571
+ parts.push({ content: entry, isFile: false });
452
572
  continue;
453
573
  }
454
- const evalcase = rawEvalcase;
455
- const id = asString(evalcase.id);
456
- if (evalIdFilter && id !== evalIdFilter) {
574
+ if (!isJsonObject(entry)) {
457
575
  continue;
458
576
  }
459
- const conversationId = asString(evalcase.conversation_id);
460
- const outcome = asString(evalcase.outcome);
461
- const inputMessagesValue = evalcase.input_messages;
462
- const expectedMessagesValue = evalcase.expected_messages;
463
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
464
- logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
465
- continue;
466
- }
467
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
468
- const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
469
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
470
- if (hasExpectedMessages && expectedMessages.length === 0) {
471
- logWarning(`No valid expected message found for eval case: ${id}`);
472
- continue;
473
- }
474
- if (expectedMessages.length > 1) {
475
- logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
476
- }
477
- const guidelinePaths = [];
478
- const inputTextParts = [];
479
- const inputSegments = await processMessages({
480
- messages: inputMessages,
481
- searchRoots,
482
- repoRootPath,
483
- guidelinePatterns,
484
- guidelinePaths,
485
- textParts: inputTextParts,
486
- messageType: "input",
487
- verbose
488
- });
489
- const outputSegments = hasExpectedMessages ? await processMessages({
490
- messages: expectedMessages,
491
- searchRoots,
492
- repoRootPath,
493
- guidelinePatterns,
494
- messageType: "output",
495
- verbose
496
- }) : [];
497
- const codeSnippets = extractCodeBlocks(inputSegments);
498
- const expectedContent = expectedMessages[0]?.content;
499
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
500
- const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
501
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
502
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
503
- const userFilePaths = [];
504
- for (const segment of inputSegments) {
505
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
506
- userFilePaths.push(segment.resolvedPath);
577
+ const segmentType = asString3(entry.type);
578
+ if (segmentType === "file") {
579
+ const rawValue = asString3(entry.value);
580
+ if (!rawValue) {
581
+ continue;
507
582
  }
508
- }
509
- const allFilePaths = [
510
- ...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
511
- ...userFilePaths
512
- ];
513
- const testCase = {
514
- id,
515
- dataset: datasetName,
516
- conversation_id: conversationId,
517
- question,
518
- input_messages: inputMessages,
519
- input_segments: inputSegments,
520
- output_segments: outputSegments,
521
- reference_answer: referenceAnswer,
522
- guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
523
- guideline_patterns: guidelinePatterns,
524
- file_paths: allFilePaths,
525
- code_snippets: codeSnippets,
526
- expected_outcome: outcome,
527
- evaluator: evalCaseEvaluatorKind,
528
- evaluators
529
- };
530
- if (verbose) {
531
- console.log(`
532
- [Eval Case: ${id}]`);
533
- if (testCase.guideline_paths.length > 0) {
534
- console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
535
- for (const guidelinePath of testCase.guideline_paths) {
536
- console.log(` - ${guidelinePath}`);
583
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
584
+ rawValue,
585
+ searchRoots
586
+ );
587
+ if (!resolvedPath) {
588
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
589
+ logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
590
+ continue;
591
+ }
592
+ try {
593
+ const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
594
+ parts.push({ content: fileContent, isFile: true, displayPath });
595
+ if (verbose) {
596
+ console.log(` [Expected Assistant File] Found: ${displayPath}`);
597
+ console.log(` Resolved to: ${resolvedPath}`);
537
598
  }
538
- } else {
539
- console.log(" No guidelines found");
599
+ } catch (error) {
600
+ logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
540
601
  }
602
+ continue;
541
603
  }
542
- results.push(testCase);
543
- }
544
- return results;
545
- }
546
- function needsRoleMarkers(messages, processedSegmentsByMessage) {
547
- if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
548
- return true;
549
- }
550
- let messagesWithContent = 0;
551
- for (const segments of processedSegmentsByMessage) {
552
- if (hasVisibleContent(segments)) {
553
- messagesWithContent++;
604
+ const textValue = asString3(entry.text);
605
+ if (typeof textValue === "string") {
606
+ parts.push({ content: textValue, isFile: false });
607
+ continue;
608
+ }
609
+ const valueValue = asString3(entry.value);
610
+ if (typeof valueValue === "string") {
611
+ parts.push({ content: valueValue, isFile: false });
612
+ continue;
554
613
  }
614
+ parts.push({ content: JSON.stringify(entry), isFile: false });
555
615
  }
556
- return messagesWithContent > 1;
616
+ return formatFileContents(parts);
557
617
  }
558
- function hasVisibleContent(segments) {
559
- return segments.some((segment) => {
560
- const type = asString(segment.type);
561
- if (type === "text") {
562
- const value = asString(segment.value);
563
- return value !== void 0 && value.trim().length > 0;
564
- }
565
- if (type === "guideline_ref") {
566
- return false;
567
- }
568
- if (type === "file") {
569
- const text = asString(segment.text);
570
- return text !== void 0 && text.trim().length > 0;
571
- }
572
- return false;
573
- });
618
+ function asString3(value) {
619
+ return typeof value === "string" ? value : void 0;
574
620
  }
575
- function formatSegment(segment) {
576
- const type = asString(segment.type);
577
- if (type === "text") {
578
- return asString(segment.value);
621
+ function cloneJsonObject(source) {
622
+ const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
623
+ return Object.fromEntries(entries);
624
+ }
625
+ function cloneJsonValue(value) {
626
+ if (value === null) {
627
+ return null;
579
628
  }
580
- if (type === "guideline_ref") {
581
- const refPath = asString(segment.path);
582
- return refPath ? `<Attached: ${refPath}>` : void 0;
629
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
630
+ return value;
583
631
  }
584
- if (type === "file") {
585
- const text = asString(segment.text);
586
- const filePath = asString(segment.path);
587
- if (text && filePath) {
588
- return `=== ${filePath} ===
589
- ${text}`;
590
- }
632
+ if (Array.isArray(value)) {
633
+ return value.map((item) => cloneJsonValue(item));
634
+ }
635
+ if (typeof value === "object") {
636
+ return cloneJsonObject(value);
637
+ }
638
+ return value;
639
+ }
640
+ function logWarning3(message, details) {
641
+ if (details && details.length > 0) {
642
+ const detailBlock = details.join("\n");
643
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
644
+ ${detailBlock}${ANSI_RESET3}`);
645
+ } else {
646
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
591
647
  }
592
- return void 0;
593
648
  }
649
+
650
+ // src/evaluation/formatting/prompt-builder.ts
651
+ var import_promises4 = require("fs/promises");
652
+ var import_node_path5 = __toESM(require("path"), 1);
653
+ var ANSI_YELLOW4 = "\x1B[33m";
654
+ var ANSI_RESET4 = "\x1B[0m";
594
655
  async function buildPromptInputs(testCase) {
595
- const guidelineContents = [];
656
+ const guidelineParts = [];
596
657
  for (const rawPath of testCase.guideline_paths) {
597
- const absolutePath = import_node_path2.default.resolve(rawPath);
598
- if (!await fileExists2(absolutePath)) {
599
- logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
658
+ const absolutePath = import_node_path5.default.resolve(rawPath);
659
+ if (!await fileExists(absolutePath)) {
660
+ logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
600
661
  continue;
601
662
  }
602
663
  try {
603
- const content = (await (0, import_promises2.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n");
604
- guidelineContents.push(`=== ${import_node_path2.default.basename(absolutePath)} ===
605
- ${content}`);
664
+ const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
665
+ guidelineParts.push({
666
+ content,
667
+ isFile: true,
668
+ displayPath: import_node_path5.default.basename(absolutePath)
669
+ });
606
670
  } catch (error) {
607
- logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
671
+ logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
608
672
  }
609
673
  }
610
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
674
+ const guidelines = formatFileContents(guidelineParts);
611
675
  const segmentsByMessage = [];
612
676
  const fileContentsByPath = /* @__PURE__ */ new Map();
613
677
  for (const segment of testCase.input_segments) {
@@ -628,9 +692,9 @@ ${content}`);
628
692
  messageSegments.push({ type: "text", value: segment });
629
693
  }
630
694
  } else if (isJsonObject(segment)) {
631
- const type = asString(segment.type);
695
+ const type = asString4(segment.type);
632
696
  if (type === "file") {
633
- const value = asString(segment.value);
697
+ const value = asString4(segment.value);
634
698
  if (!value) continue;
635
699
  if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
636
700
  messageSegments.push({ type: "guideline_ref", path: value });
@@ -641,7 +705,7 @@ ${content}`);
641
705
  messageSegments.push({ type: "file", text: fileText, path: value });
642
706
  }
643
707
  } else if (type === "text") {
644
- const textValue = asString(segment.value);
708
+ const textValue = asString4(segment.value);
645
709
  if (textValue && textValue.trim().length > 0) {
646
710
  messageSegments.push({ type: "text", value: textValue });
647
711
  }
@@ -697,6 +761,18 @@ ${messageContent}`);
697
761
  }) : void 0;
698
762
  return { question, guidelines, chatPrompt };
699
763
  }
764
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
765
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
766
+ return true;
767
+ }
768
+ let messagesWithContent = 0;
769
+ for (const segments of processedSegmentsByMessage) {
770
+ if (hasVisibleContent(segments)) {
771
+ messagesWithContent++;
772
+ }
773
+ }
774
+ return messagesWithContent > 1;
775
+ }
700
776
  function buildChatPromptFromSegments(options) {
701
777
  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
702
778
  if (messages.length === 0) {
@@ -756,209 +832,294 @@ ${guidelineContent.trim()}`);
756
832
  if (isGuidelineRef) {
757
833
  continue;
758
834
  }
759
- contentParts.push(formatted);
835
+ contentParts.push(formatted);
836
+ }
837
+ }
838
+ if (contentParts.length === 0) {
839
+ continue;
840
+ }
841
+ chatPrompt.push({
842
+ role,
843
+ content: contentParts.join("\n"),
844
+ ...name ? { name } : {}
845
+ });
846
+ }
847
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
848
+ }
849
+ function asString4(value) {
850
+ return typeof value === "string" ? value : void 0;
851
+ }
852
+ function logWarning4(message) {
853
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
854
+ }
855
+
856
+ // src/evaluation/yaml-parser.ts
857
+ var ANSI_YELLOW5 = "\x1B[33m";
858
+ var ANSI_RESET5 = "\x1B[0m";
859
+ var SCHEMA_EVAL_V2 = "agentv-eval-v2";
860
+ async function readTestSuiteMetadata(testFilePath) {
861
+ try {
862
+ const absolutePath = import_node_path6.default.resolve(testFilePath);
863
+ const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
864
+ const parsed = (0, import_yaml2.parse)(content);
865
+ if (!isJsonObject(parsed)) {
866
+ return {};
867
+ }
868
+ return { target: extractTargetFromSuite(parsed) };
869
+ } catch {
870
+ return {};
871
+ }
872
+ }
873
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
874
+ const verbose = options?.verbose ?? false;
875
+ const evalIdFilter = options?.evalId;
876
+ const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
877
+ const repoRootPath = resolveToAbsolutePath(repoRoot);
878
+ const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
879
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
880
+ const guidelinePatterns = config?.guideline_patterns;
881
+ const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
882
+ const parsed = (0, import_yaml2.parse)(rawFile);
883
+ if (!isJsonObject(parsed)) {
884
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
885
+ }
886
+ const suite = parsed;
887
+ const datasetNameFromSuite = asString5(suite.dataset)?.trim();
888
+ const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
889
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
890
+ const schema = suite.$schema;
891
+ if (schema !== SCHEMA_EVAL_V2) {
892
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
893
+ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
894
+ throw new Error(message);
895
+ }
896
+ const rawTestcases = suite.evalcases;
897
+ if (!Array.isArray(rawTestcases)) {
898
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
899
+ }
900
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
901
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
902
+ const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
903
+ const results = [];
904
+ for (const rawEvalcase of rawTestcases) {
905
+ if (!isJsonObject(rawEvalcase)) {
906
+ logWarning5("Skipping invalid eval case entry (expected object)");
907
+ continue;
908
+ }
909
+ const evalcase = rawEvalcase;
910
+ const id = asString5(evalcase.id);
911
+ if (evalIdFilter && id !== evalIdFilter) {
912
+ continue;
913
+ }
914
+ const conversationId = asString5(evalcase.conversation_id);
915
+ const outcome = asString5(evalcase.outcome);
916
+ const inputMessagesValue = evalcase.input_messages;
917
+ const expectedMessagesValue = evalcase.expected_messages;
918
+ if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
919
+ logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
920
+ continue;
921
+ }
922
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
923
+ const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
924
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
925
+ if (hasExpectedMessages && expectedMessages.length === 0) {
926
+ logWarning5(`No valid expected message found for eval case: ${id}`);
927
+ continue;
928
+ }
929
+ if (expectedMessages.length > 1) {
930
+ logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
931
+ }
932
+ const guidelinePaths = [];
933
+ const inputTextParts = [];
934
+ const inputSegments = await processMessages({
935
+ messages: inputMessages,
936
+ searchRoots,
937
+ repoRootPath,
938
+ guidelinePatterns,
939
+ guidelinePaths,
940
+ textParts: inputTextParts,
941
+ messageType: "input",
942
+ verbose
943
+ });
944
+ const outputSegments = hasExpectedMessages ? await processMessages({
945
+ messages: expectedMessages,
946
+ searchRoots,
947
+ repoRootPath,
948
+ guidelinePatterns,
949
+ messageType: "output",
950
+ verbose
951
+ }) : [];
952
+ const codeSnippets = extractCodeBlocks(inputSegments);
953
+ const expectedContent = expectedMessages[0]?.content;
954
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
955
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
956
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
957
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
958
+ const userFilePaths = [];
959
+ for (const segment of inputSegments) {
960
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
961
+ userFilePaths.push(segment.resolvedPath);
962
+ }
963
+ }
964
+ const allFilePaths = [
965
+ ...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
966
+ ...userFilePaths
967
+ ];
968
+ const testCase = {
969
+ id,
970
+ dataset: datasetName,
971
+ conversation_id: conversationId,
972
+ question,
973
+ input_messages: inputMessages,
974
+ input_segments: inputSegments,
975
+ output_segments: outputSegments,
976
+ reference_answer: referenceAnswer,
977
+ guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
978
+ guideline_patterns: guidelinePatterns,
979
+ file_paths: allFilePaths,
980
+ code_snippets: codeSnippets,
981
+ expected_outcome: outcome,
982
+ evaluator: evalCaseEvaluatorKind,
983
+ evaluators
984
+ };
985
+ if (verbose) {
986
+ console.log(`
987
+ [Eval Case: ${id}]`);
988
+ if (testCase.guideline_paths.length > 0) {
989
+ console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
990
+ for (const guidelinePath of testCase.guideline_paths) {
991
+ console.log(` - ${guidelinePath}`);
992
+ }
993
+ } else {
994
+ console.log(" No guidelines found");
760
995
  }
761
996
  }
762
- if (contentParts.length === 0) {
763
- continue;
764
- }
765
- chatPrompt.push({
766
- role,
767
- content: contentParts.join("\n"),
768
- ...name ? { name } : {}
769
- });
997
+ results.push(testCase);
770
998
  }
771
- return chatPrompt.length > 0 ? chatPrompt : void 0;
999
+ return results;
1000
+ }
1001
+ function asString5(value) {
1002
+ return typeof value === "string" ? value : void 0;
772
1003
  }
773
- async function fileExists2(absolutePath) {
1004
+ function logWarning5(message, details) {
1005
+ if (details && details.length > 0) {
1006
+ const detailBlock = details.join("\n");
1007
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}
1008
+ ${detailBlock}${ANSI_RESET5}`);
1009
+ } else {
1010
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
1011
+ }
1012
+ }
1013
+
1014
+ // src/evaluation/file-utils.ts
1015
+ var import_node_fs2 = require("fs");
1016
+ var import_promises6 = require("fs/promises");
1017
+ var import_node_path7 = __toESM(require("path"), 1);
1018
+ async function fileExists2(filePath) {
774
1019
  try {
775
- await (0, import_promises2.access)(absolutePath, import_node_fs2.constants.F_OK);
1020
+ await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
776
1021
  return true;
777
1022
  } catch {
778
1023
  return false;
779
1024
  }
780
1025
  }
781
- function resolveToAbsolutePath(candidate) {
782
- if (candidate instanceof URL) {
783
- return (0, import_node_url.fileURLToPath)(candidate);
784
- }
785
- if (typeof candidate === "string") {
786
- if (candidate.startsWith("file://")) {
787
- return (0, import_node_url.fileURLToPath)(new URL(candidate));
788
- }
789
- return import_node_path2.default.resolve(candidate);
790
- }
791
- throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
792
- }
793
- function asString(value) {
794
- return typeof value === "string" ? value : void 0;
1026
+ function normalizeLineEndings(content) {
1027
+ return content.replace(/\r\n/g, "\n");
795
1028
  }
796
- function cloneJsonObject(source) {
797
- const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
798
- return Object.fromEntries(entries);
1029
+ async function readTextFile(filePath) {
1030
+ const content = await (0, import_promises6.readFile)(filePath, "utf8");
1031
+ return normalizeLineEndings(content);
799
1032
  }
800
- function cloneJsonValue(value) {
801
- if (value === null) {
802
- return null;
803
- }
804
- if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
805
- return value;
806
- }
807
- if (Array.isArray(value)) {
808
- return value.map((item) => cloneJsonValue(item));
1033
+ async function findGitRoot(startPath) {
1034
+ let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
1035
+ const root = import_node_path7.default.parse(currentDir).root;
1036
+ while (currentDir !== root) {
1037
+ const gitPath = import_node_path7.default.join(currentDir, ".git");
1038
+ if (await fileExists2(gitPath)) {
1039
+ return currentDir;
1040
+ }
1041
+ const parentDir = import_node_path7.default.dirname(currentDir);
1042
+ if (parentDir === currentDir) {
1043
+ break;
1044
+ }
1045
+ currentDir = parentDir;
809
1046
  }
810
- return cloneJsonObject(value);
1047
+ return null;
811
1048
  }
812
- async function resolveAssistantContent(content, searchRoots, verbose) {
813
- if (typeof content === "string") {
814
- return content;
815
- }
816
- if (!content) {
817
- return "";
818
- }
819
- const parts = [];
820
- for (const entry of content) {
821
- if (typeof entry === "string") {
822
- parts.push(entry);
823
- continue;
1049
+ function buildDirectoryChain2(filePath, repoRoot) {
1050
+ const directories = [];
1051
+ const seen = /* @__PURE__ */ new Set();
1052
+ const boundary = import_node_path7.default.resolve(repoRoot);
1053
+ let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
1054
+ while (current !== void 0) {
1055
+ if (!seen.has(current)) {
1056
+ directories.push(current);
1057
+ seen.add(current);
824
1058
  }
825
- if (!isJsonObject(entry)) {
826
- continue;
1059
+ if (current === boundary) {
1060
+ break;
827
1061
  }
828
- const segmentType = asString(entry.type);
829
- if (segmentType === "file") {
830
- const rawValue = asString(entry.value);
831
- if (!rawValue) {
832
- continue;
833
- }
834
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
835
- rawValue,
836
- searchRoots
837
- );
838
- if (!resolvedPath) {
839
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
840
- logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
841
- continue;
842
- }
843
- try {
844
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
845
- parts.push(fileContent);
846
- if (verbose) {
847
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
848
- console.log(` Resolved to: ${resolvedPath}`);
849
- }
850
- } catch (error) {
851
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
852
- }
853
- continue;
1062
+ const parent = import_node_path7.default.dirname(current);
1063
+ if (parent === current) {
1064
+ break;
854
1065
  }
855
- const textValue = asString(entry.text);
856
- if (typeof textValue === "string") {
857
- parts.push(textValue);
858
- continue;
1066
+ current = parent;
1067
+ }
1068
+ if (!seen.has(boundary)) {
1069
+ directories.push(boundary);
1070
+ }
1071
+ return directories;
1072
+ }
1073
+ function buildSearchRoots2(evalPath, repoRoot) {
1074
+ const uniqueRoots = [];
1075
+ const addRoot = (root) => {
1076
+ const normalized = import_node_path7.default.resolve(root);
1077
+ if (!uniqueRoots.includes(normalized)) {
1078
+ uniqueRoots.push(normalized);
859
1079
  }
860
- const valueValue = asString(entry.value);
861
- if (typeof valueValue === "string") {
862
- parts.push(valueValue);
863
- continue;
1080
+ };
1081
+ let currentDir = import_node_path7.default.dirname(evalPath);
1082
+ let reachedBoundary = false;
1083
+ while (!reachedBoundary) {
1084
+ addRoot(currentDir);
1085
+ const parentDir = import_node_path7.default.dirname(currentDir);
1086
+ if (currentDir === repoRoot || parentDir === currentDir) {
1087
+ reachedBoundary = true;
1088
+ } else {
1089
+ currentDir = parentDir;
864
1090
  }
865
- parts.push(JSON.stringify(entry));
866
1091
  }
867
- return parts.join(" ");
1092
+ addRoot(repoRoot);
1093
+ addRoot(process.cwd());
1094
+ return uniqueRoots;
868
1095
  }
869
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
870
- const execution = rawEvalCase.execution;
871
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
872
- if (candidateEvaluators === void 0) {
873
- return void 0;
1096
+ function trimLeadingSeparators2(value) {
1097
+ const trimmed = value.replace(/^[/\\]+/, "");
1098
+ return trimmed.length > 0 ? trimmed : value;
1099
+ }
1100
+ async function resolveFileReference2(rawValue, searchRoots) {
1101
+ const displayPath = trimLeadingSeparators2(rawValue);
1102
+ const potentialPaths = [];
1103
+ if (import_node_path7.default.isAbsolute(rawValue)) {
1104
+ potentialPaths.push(import_node_path7.default.normalize(rawValue));
874
1105
  }
875
- if (!Array.isArray(candidateEvaluators)) {
876
- logWarning(`Skipping evaluators for '${evalId}': expected array`);
877
- return void 0;
1106
+ for (const base of searchRoots) {
1107
+ potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
878
1108
  }
879
- const evaluators = [];
880
- for (const rawEvaluator of candidateEvaluators) {
881
- if (!isJsonObject(rawEvaluator)) {
882
- logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
883
- continue;
884
- }
885
- const name = asString(rawEvaluator.name);
886
- const typeValue = rawEvaluator.type;
887
- if (!name || !isEvaluatorKind(typeValue)) {
888
- logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
889
- continue;
890
- }
891
- if (typeValue === "code") {
892
- const script = asString(rawEvaluator.script);
893
- if (!script) {
894
- logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
895
- continue;
896
- }
897
- const cwd = asString(rawEvaluator.cwd);
898
- let resolvedCwd;
899
- if (cwd) {
900
- const resolved = await resolveFileReference(cwd, searchRoots);
901
- if (resolved.resolvedPath) {
902
- resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
903
- } else {
904
- logWarning(
905
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
906
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
907
- );
908
- }
909
- } else {
910
- resolvedCwd = searchRoots[0];
911
- }
912
- evaluators.push({
913
- name,
914
- type: "code",
915
- script,
916
- cwd,
917
- resolvedCwd
918
- });
1109
+ const attempted = [];
1110
+ const seen = /* @__PURE__ */ new Set();
1111
+ for (const candidate of potentialPaths) {
1112
+ const absoluteCandidate = import_node_path7.default.resolve(candidate);
1113
+ if (seen.has(absoluteCandidate)) {
919
1114
  continue;
920
1115
  }
921
- const prompt = asString(rawEvaluator.prompt);
922
- let promptPath;
923
- if (prompt) {
924
- const resolved = await resolveFileReference(prompt, searchRoots);
925
- if (resolved.resolvedPath) {
926
- promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
927
- } else {
928
- logWarning(
929
- `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
930
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
931
- );
932
- }
1116
+ seen.add(absoluteCandidate);
1117
+ attempted.push(absoluteCandidate);
1118
+ if (await fileExists2(absoluteCandidate)) {
1119
+ return { displayPath, resolvedPath: absoluteCandidate, attempted };
933
1120
  }
934
- const model = asString(rawEvaluator.model);
935
- evaluators.push({
936
- name,
937
- type: "llm_judge",
938
- prompt,
939
- promptPath
940
- });
941
- }
942
- return evaluators.length > 0 ? evaluators : void 0;
943
- }
944
- function coerceEvaluator(candidate, contextId) {
945
- if (typeof candidate !== "string") {
946
- return void 0;
947
- }
948
- if (isEvaluatorKind(candidate)) {
949
- return candidate;
950
- }
951
- logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
952
- return void 0;
953
- }
954
- function logWarning(message, details) {
955
- if (details && details.length > 0) {
956
- const detailBlock = details.join("\n");
957
- console.warn(`${ANSI_YELLOW}Warning: ${message}
958
- ${detailBlock}${ANSI_RESET}`);
959
- } else {
960
- console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
961
1121
  }
1122
+ return { displayPath, attempted };
962
1123
  }
963
1124
 
964
1125
  // src/evaluation/providers/ax.ts
@@ -989,9 +1150,8 @@ function buildChatPrompt(request) {
989
1150
  }
990
1151
  function resolveSystemContent(request) {
991
1152
  const systemSegments = [];
992
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
993
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
994
- systemSegments.push(metadataSystemPrompt.trim());
1153
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
1154
+ systemSegments.push(request.systemPrompt.trim());
995
1155
  } else {
996
1156
  systemSegments.push(DEFAULT_SYSTEM_PROMPT);
997
1157
  }
@@ -1242,9 +1402,9 @@ var GeminiProvider = class {
1242
1402
 
1243
1403
  // src/evaluation/providers/cli.ts
1244
1404
  var import_node_child_process = require("child_process");
1245
- var import_promises3 = __toESM(require("fs/promises"), 1);
1405
+ var import_promises7 = __toESM(require("fs/promises"), 1);
1246
1406
  var import_node_os = __toESM(require("os"), 1);
1247
- var import_node_path3 = __toESM(require("path"), 1);
1407
+ var import_node_path8 = __toESM(require("path"), 1);
1248
1408
  var import_node_util = require("util");
1249
1409
  var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
1250
1410
  var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
@@ -1341,7 +1501,7 @@ var CliProvider = class {
1341
1501
  const errorMsg = error instanceof Error ? error.message : String(error);
1342
1502
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1343
1503
  } finally {
1344
- await import_promises3.default.unlink(filePath).catch(() => {
1504
+ await import_promises7.default.unlink(filePath).catch(() => {
1345
1505
  });
1346
1506
  }
1347
1507
  }
@@ -1423,7 +1583,7 @@ function normalizeInputFiles(inputFiles) {
1423
1583
  }
1424
1584
  const unique = /* @__PURE__ */ new Map();
1425
1585
  for (const inputFile of inputFiles) {
1426
- const absolutePath = import_node_path3.default.resolve(inputFile);
1586
+ const absolutePath = import_node_path8.default.resolve(inputFile);
1427
1587
  if (!unique.has(absolutePath)) {
1428
1588
  unique.set(absolutePath, absolutePath);
1429
1589
  }
@@ -1437,7 +1597,7 @@ function formatFileList(files, template) {
1437
1597
  const formatter = template ?? "{path}";
1438
1598
  return files.map((filePath) => {
1439
1599
  const escapedPath = shellEscape(filePath);
1440
- const escapedName = shellEscape(import_node_path3.default.basename(filePath));
1600
+ const escapedName = shellEscape(import_node_path8.default.basename(filePath));
1441
1601
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1442
1602
  }).join(" ");
1443
1603
  }
@@ -1461,7 +1621,7 @@ function generateOutputFilePath(evalCaseId) {
1461
1621
  const safeEvalId = evalCaseId || "unknown";
1462
1622
  const timestamp = Date.now();
1463
1623
  const random = Math.random().toString(36).substring(2, 9);
1464
- return import_node_path3.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1624
+ return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1465
1625
  }
1466
1626
  function formatTimeoutSuffix(timeoutMs) {
1467
1627
  if (!timeoutMs || timeoutMs <= 0) {
@@ -1475,9 +1635,9 @@ function formatTimeoutSuffix(timeoutMs) {
1475
1635
  var import_node_child_process2 = require("child_process");
1476
1636
  var import_node_crypto = require("crypto");
1477
1637
  var import_node_fs3 = require("fs");
1478
- var import_promises4 = require("fs/promises");
1638
+ var import_promises8 = require("fs/promises");
1479
1639
  var import_node_os2 = require("os");
1480
- var import_node_path5 = __toESM(require("path"), 1);
1640
+ var import_node_path10 = __toESM(require("path"), 1);
1481
1641
  var import_node_util2 = require("util");
1482
1642
 
1483
1643
  // src/evaluation/providers/codex-log-tracker.ts
@@ -1534,7 +1694,7 @@ function subscribeToCodexLogEntries(listener) {
1534
1694
  }
1535
1695
 
1536
1696
  // src/evaluation/providers/preread.ts
1537
- var import_node_path4 = __toESM(require("path"), 1);
1697
+ var import_node_path9 = __toESM(require("path"), 1);
1538
1698
  function buildPromptDocument(request, inputFiles, options) {
1539
1699
  const parts = [];
1540
1700
  const guidelineFiles = collectGuidelineFiles(
@@ -1559,7 +1719,7 @@ function normalizeInputFiles2(inputFiles) {
1559
1719
  }
1560
1720
  const deduped = /* @__PURE__ */ new Map();
1561
1721
  for (const inputFile of inputFiles) {
1562
- const absolutePath = import_node_path4.default.resolve(inputFile);
1722
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1563
1723
  if (!deduped.has(absolutePath)) {
1564
1724
  deduped.set(absolutePath, absolutePath);
1565
1725
  }
@@ -1572,14 +1732,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
1572
1732
  }
1573
1733
  const unique = /* @__PURE__ */ new Map();
1574
1734
  for (const inputFile of inputFiles) {
1575
- const absolutePath = import_node_path4.default.resolve(inputFile);
1735
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1576
1736
  if (overrides?.has(absolutePath)) {
1577
1737
  if (!unique.has(absolutePath)) {
1578
1738
  unique.set(absolutePath, absolutePath);
1579
1739
  }
1580
1740
  continue;
1581
1741
  }
1582
- const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
1742
+ const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
1583
1743
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1584
1744
  if (!unique.has(absolutePath)) {
1585
1745
  unique.set(absolutePath, absolutePath);
@@ -1594,7 +1754,7 @@ function collectInputFiles(inputFiles) {
1594
1754
  }
1595
1755
  const unique = /* @__PURE__ */ new Map();
1596
1756
  for (const inputFile of inputFiles) {
1597
- const absolutePath = import_node_path4.default.resolve(inputFile);
1757
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1598
1758
  if (!unique.has(absolutePath)) {
1599
1759
  unique.set(absolutePath, absolutePath);
1600
1760
  }
@@ -1606,7 +1766,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
1606
1766
  return "";
1607
1767
  }
1608
1768
  const buildList = (files) => files.map((absolutePath) => {
1609
- const fileName = import_node_path4.default.basename(absolutePath);
1769
+ const fileName = import_node_path9.default.basename(absolutePath);
1610
1770
  const fileUri = pathToFileUri(absolutePath);
1611
1771
  return `* [${fileName}](${fileUri})`;
1612
1772
  });
@@ -1626,7 +1786,7 @@ ${buildList(inputFiles).join("\n")}.`);
1626
1786
  return sections.join("\n");
1627
1787
  }
1628
1788
  function pathToFileUri(filePath) {
1629
- const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
1789
+ const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
1630
1790
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1631
1791
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1632
1792
  return `file:///${normalizedPath}`;
@@ -1664,8 +1824,8 @@ var CodexProvider = class {
1664
1824
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1665
1825
  try {
1666
1826
  const promptContent = buildPromptDocument(request, inputFiles);
1667
- const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
1668
- await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
1827
+ const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
1828
+ await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
1669
1829
  const args = this.buildCodexArgs();
1670
1830
  const cwd = this.resolveCwd(workspaceRoot);
1671
1831
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -1714,7 +1874,7 @@ var CodexProvider = class {
1714
1874
  if (!this.config.cwd) {
1715
1875
  return workspaceRoot;
1716
1876
  }
1717
- return import_node_path5.default.resolve(this.config.cwd);
1877
+ return import_node_path10.default.resolve(this.config.cwd);
1718
1878
  }
1719
1879
  buildCodexArgs() {
1720
1880
  const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
@@ -1748,11 +1908,11 @@ var CodexProvider = class {
1748
1908
  }
1749
1909
  }
1750
1910
  async createWorkspace() {
1751
- return await (0, import_promises4.mkdtemp)(import_node_path5.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1911
+ return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1752
1912
  }
1753
1913
  async cleanupWorkspace(workspaceRoot) {
1754
1914
  try {
1755
- await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
1915
+ await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
1756
1916
  } catch {
1757
1917
  }
1758
1918
  }
@@ -1762,9 +1922,9 @@ var CodexProvider = class {
1762
1922
  return void 0;
1763
1923
  }
1764
1924
  if (this.config.logDir) {
1765
- return import_node_path5.default.resolve(this.config.logDir);
1925
+ return import_node_path10.default.resolve(this.config.logDir);
1766
1926
  }
1767
- return import_node_path5.default.join(process.cwd(), ".agentv", "logs", "codex");
1927
+ return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
1768
1928
  }
1769
1929
  async createStreamLogger(request) {
1770
1930
  const logDir = this.resolveLogDirectory();
@@ -1772,13 +1932,13 @@ var CodexProvider = class {
1772
1932
  return void 0;
1773
1933
  }
1774
1934
  try {
1775
- await (0, import_promises4.mkdir)(logDir, { recursive: true });
1935
+ await (0, import_promises8.mkdir)(logDir, { recursive: true });
1776
1936
  } catch (error) {
1777
1937
  const message = error instanceof Error ? error.message : String(error);
1778
1938
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
1779
1939
  return void 0;
1780
1940
  }
1781
- const filePath = import_node_path5.default.join(logDir, buildLogFilename(request, this.targetName));
1941
+ const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
1782
1942
  try {
1783
1943
  const logger = await CodexStreamLogger.create({
1784
1944
  filePath,
@@ -1993,9 +2153,9 @@ function tryParseJsonValue(rawLine) {
1993
2153
  async function locateExecutable(candidate) {
1994
2154
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
1995
2155
  if (includesPathSeparator) {
1996
- const resolved = import_node_path5.default.isAbsolute(candidate) ? candidate : import_node_path5.default.resolve(candidate);
2156
+ const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
1997
2157
  const executablePath = await ensureWindowsExecutableVariant(resolved);
1998
- await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
2158
+ await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
1999
2159
  return executablePath;
2000
2160
  }
2001
2161
  const locator = process.platform === "win32" ? "where" : "which";
@@ -2005,7 +2165,7 @@ async function locateExecutable(candidate) {
2005
2165
  const preferred = selectExecutableCandidate(lines);
2006
2166
  if (preferred) {
2007
2167
  const executablePath = await ensureWindowsExecutableVariant(preferred);
2008
- await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
2168
+ await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
2009
2169
  return executablePath;
2010
2170
  }
2011
2171
  } catch {
@@ -2039,7 +2199,7 @@ async function ensureWindowsExecutableVariant(candidate) {
2039
2199
  for (const ext of extensions) {
2040
2200
  const withExtension = `${candidate}${ext}`;
2041
2201
  try {
2042
- await (0, import_promises4.access)(withExtension, import_node_fs3.constants.F_OK);
2202
+ await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
2043
2203
  return withExtension;
2044
2204
  } catch {
2045
2205
  }
@@ -2851,7 +3011,7 @@ function resolveOptionalNumberArray(source, description) {
2851
3011
  }
2852
3012
 
2853
3013
  // src/evaluation/providers/vscode.ts
2854
- var import_node_path6 = __toESM(require("path"), 1);
3014
+ var import_node_path11 = __toESM(require("path"), 1);
2855
3015
  var import_subagent = require("subagent");
2856
3016
  var VSCodeProvider = class {
2857
3017
  id;
@@ -2964,6 +3124,9 @@ var VSCodeProvider = class {
2964
3124
  };
2965
3125
  function buildPromptDocument2(request, attachments, guidelinePatterns) {
2966
3126
  const parts = [];
3127
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
3128
+ parts.push(request.systemPrompt.trim());
3129
+ }
2967
3130
  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
2968
3131
  const attachmentFiles = collectAttachmentFiles(attachments);
2969
3132
  const nonGuidelineAttachments = attachmentFiles.filter(
@@ -2981,7 +3144,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
2981
3144
  return "";
2982
3145
  }
2983
3146
  const buildList = (files) => files.map((absolutePath) => {
2984
- const fileName = import_node_path6.default.basename(absolutePath);
3147
+ const fileName = import_node_path11.default.basename(absolutePath);
2985
3148
  const fileUri = pathToFileUri2(absolutePath);
2986
3149
  return `* [${fileName}](${fileUri})`;
2987
3150
  });
@@ -3006,8 +3169,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3006
3169
  }
3007
3170
  const unique = /* @__PURE__ */ new Map();
3008
3171
  for (const attachment of attachments) {
3009
- const absolutePath = import_node_path6.default.resolve(attachment);
3010
- const normalized = absolutePath.split(import_node_path6.default.sep).join("/");
3172
+ const absolutePath = import_node_path11.default.resolve(attachment);
3173
+ const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
3011
3174
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3012
3175
  if (!unique.has(absolutePath)) {
3013
3176
  unique.set(absolutePath, absolutePath);
@@ -3022,7 +3185,7 @@ function collectAttachmentFiles(attachments) {
3022
3185
  }
3023
3186
  const unique = /* @__PURE__ */ new Map();
3024
3187
  for (const attachment of attachments) {
3025
- const absolutePath = import_node_path6.default.resolve(attachment);
3188
+ const absolutePath = import_node_path11.default.resolve(attachment);
3026
3189
  if (!unique.has(absolutePath)) {
3027
3190
  unique.set(absolutePath, absolutePath);
3028
3191
  }
@@ -3030,7 +3193,7 @@ function collectAttachmentFiles(attachments) {
3030
3193
  return Array.from(unique.values());
3031
3194
  }
3032
3195
  function pathToFileUri2(filePath) {
3033
- const absolutePath = import_node_path6.default.isAbsolute(filePath) ? filePath : import_node_path6.default.resolve(filePath);
3196
+ const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
3034
3197
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3035
3198
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3036
3199
  return `file:///${normalizedPath}`;
@@ -3043,7 +3206,7 @@ function normalizeAttachments(attachments) {
3043
3206
  }
3044
3207
  const deduped = /* @__PURE__ */ new Set();
3045
3208
  for (const attachment of attachments) {
3046
- deduped.add(import_node_path6.default.resolve(attachment));
3209
+ deduped.add(import_node_path11.default.resolve(attachment));
3047
3210
  }
3048
3211
  return Array.from(deduped);
3049
3212
  }
@@ -3052,7 +3215,7 @@ function mergeAttachments(all) {
3052
3215
  for (const list of all) {
3053
3216
  if (!list) continue;
3054
3217
  for (const inputFile of list) {
3055
- deduped.add(import_node_path6.default.resolve(inputFile));
3218
+ deduped.add(import_node_path11.default.resolve(inputFile));
3056
3219
  }
3057
3220
  }
3058
3221
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3098,9 +3261,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3098
3261
 
3099
3262
  // src/evaluation/providers/targets-file.ts
3100
3263
  var import_node_fs4 = require("fs");
3101
- var import_promises5 = require("fs/promises");
3102
- var import_node_path7 = __toESM(require("path"), 1);
3103
- var import_yaml2 = require("yaml");
3264
+ var import_promises9 = require("fs/promises");
3265
+ var import_node_path12 = __toESM(require("path"), 1);
3266
+ var import_yaml3 = require("yaml");
3104
3267
 
3105
3268
  // src/evaluation/providers/types.ts
3106
3269
  var AGENT_PROVIDER_KINDS = [
@@ -3161,19 +3324,19 @@ function assertTargetDefinition(value, index, filePath) {
3161
3324
  }
3162
3325
  async function fileExists3(filePath) {
3163
3326
  try {
3164
- await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
3327
+ await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
3165
3328
  return true;
3166
3329
  } catch {
3167
3330
  return false;
3168
3331
  }
3169
3332
  }
3170
3333
  async function readTargetDefinitions(filePath) {
3171
- const absolutePath = import_node_path7.default.resolve(filePath);
3334
+ const absolutePath = import_node_path12.default.resolve(filePath);
3172
3335
  if (!await fileExists3(absolutePath)) {
3173
3336
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3174
3337
  }
3175
- const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
3176
- const parsed = (0, import_yaml2.parse)(raw);
3338
+ const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
3339
+ const parsed = (0, import_yaml3.parse)(raw);
3177
3340
  if (!isRecord(parsed)) {
3178
3341
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
3179
3342
  }
@@ -3216,18 +3379,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
3216
3379
  }
3217
3380
 
3218
3381
  // src/evaluation/evaluators.ts
3219
- var import_node_crypto2 = require("crypto");
3382
+ var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3383
+
3384
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3385
+
3386
+ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3387
+
3388
+ [[ ## expected_outcome ## ]]
3389
+ {{expected_outcome}}
3390
+
3391
+ [[ ## question ## ]]
3392
+ {{question}}
3393
+
3394
+ [[ ## reference_answer ## ]]
3395
+ {{reference_answer}}
3396
+
3397
+ [[ ## candidate_answer ## ]]
3398
+ {{candidate_answer}}`;
3220
3399
  var LlmJudgeEvaluator = class {
3221
3400
  kind = "llm_judge";
3222
3401
  resolveJudgeProvider;
3223
3402
  maxOutputTokens;
3224
3403
  temperature;
3225
- customPrompt;
3404
+ evaluatorTemplate;
3226
3405
  constructor(options) {
3227
3406
  this.resolveJudgeProvider = options.resolveJudgeProvider;
3228
3407
  this.maxOutputTokens = options.maxOutputTokens;
3229
3408
  this.temperature = options.temperature;
3230
- this.customPrompt = options.customPrompt;
3409
+ this.evaluatorTemplate = options.evaluatorTemplate;
3231
3410
  }
3232
3411
  async evaluate(context) {
3233
3412
  const judgeProvider = await this.resolveJudgeProvider(context);
@@ -3237,26 +3416,21 @@ var LlmJudgeEvaluator = class {
3237
3416
  return this.evaluateWithPrompt(context, judgeProvider);
3238
3417
  }
3239
3418
  async evaluateWithPrompt(context, judgeProvider) {
3240
- const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
3241
3419
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3242
- let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
3243
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
3244
- if (systemPrompt && hasTemplateVariables(systemPrompt)) {
3245
- const variables = {
3246
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3247
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3248
- candidate_answer: context.candidate,
3249
- reference_answer: context.evalCase.reference_answer ?? "",
3250
- expected_outcome: context.evalCase.expected_outcome,
3251
- question: formattedQuestion
3252
- };
3253
- prompt = substituteVariables(systemPrompt, variables);
3254
- systemPrompt = buildSystemPrompt(hasReferenceAnswer);
3255
- }
3256
- const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
3420
+ const variables = {
3421
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3422
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3423
+ candidate_answer: context.candidate.trim(),
3424
+ reference_answer: (context.evalCase.reference_answer ?? "").trim(),
3425
+ expected_outcome: context.evalCase.expected_outcome.trim(),
3426
+ question: formattedQuestion.trim()
3427
+ };
3428
+ const systemPrompt = buildOutputSchema();
3429
+ const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
3430
+ const userPrompt = substituteVariables(evaluatorTemplate, variables);
3257
3431
  const response = await judgeProvider.invoke({
3258
- question: prompt,
3259
- metadata,
3432
+ question: userPrompt,
3433
+ systemPrompt,
3260
3434
  evalCaseId: context.evalCase.id,
3261
3435
  attempt: context.attempt,
3262
3436
  maxOutputTokens: this.maxOutputTokens,
@@ -3269,11 +3443,9 @@ var LlmJudgeEvaluator = class {
3269
3443
  const reasoning = parsed.reasoning ?? response.reasoning;
3270
3444
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3271
3445
  const evaluatorRawRequest = {
3272
- id: (0, import_node_crypto2.randomUUID)(),
3273
- provider: judgeProvider.id,
3274
- prompt,
3275
- target: context.target.name,
3276
- ...systemPrompt !== void 0 && { systemPrompt }
3446
+ userPrompt,
3447
+ systemPrompt,
3448
+ target: judgeProvider.targetName
3277
3449
  };
3278
3450
  return {
3279
3451
  score,
@@ -3285,20 +3457,8 @@ var LlmJudgeEvaluator = class {
3285
3457
  };
3286
3458
  }
3287
3459
  };
3288
- function buildSystemPrompt(hasReferenceAnswer) {
3289
- const basePrompt = [
3290
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
3291
- ""
3292
- ];
3293
- if (hasReferenceAnswer) {
3294
- basePrompt.push(
3295
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
3296
- ""
3297
- );
3298
- }
3299
- basePrompt.push(
3300
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
3301
- "",
3460
+ function buildOutputSchema() {
3461
+ return [
3302
3462
  "You must respond with a single JSON object matching this schema:",
3303
3463
  "",
3304
3464
  "{",
@@ -3307,30 +3467,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
3307
3467
  ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
3308
3468
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
3309
3469
  "}"
3310
- );
3311
- return basePrompt.join("\n");
3312
- }
3313
- function buildQualityPrompt(evalCase, candidate, question) {
3314
- const parts = [
3315
- "[[ ## expected_outcome ## ]]",
3316
- evalCase.expected_outcome.trim(),
3317
- "",
3318
- "[[ ## question ## ]]",
3319
- question.trim(),
3320
- ""
3321
- ];
3322
- if (hasNonEmptyReferenceAnswer(evalCase)) {
3323
- parts.push(
3324
- "[[ ## reference_answer ## ]]",
3325
- evalCase.reference_answer.trim(),
3326
- ""
3327
- );
3328
- }
3329
- parts.push(
3330
- "[[ ## candidate_answer ## ]]",
3331
- candidate.trim()
3332
- );
3333
- return parts.join("\n");
3470
+ ].join("\n");
3334
3471
  }
3335
3472
  function clampScore(value) {
3336
3473
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -3412,9 +3549,6 @@ function extractJsonBlob(text) {
3412
3549
  function isNonEmptyString(value) {
3413
3550
  return typeof value === "string" && value.trim().length > 0;
3414
3551
  }
3415
- function hasNonEmptyReferenceAnswer(evalCase) {
3416
- return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
3417
- }
3418
3552
  var CodeEvaluator = class {
3419
3553
  kind = "code";
3420
3554
  script;
@@ -3520,19 +3654,16 @@ function parseJsonSafe(payload) {
3520
3654
  return void 0;
3521
3655
  }
3522
3656
  }
3523
- function hasTemplateVariables(text) {
3524
- return /\$\{[a-zA-Z0-9_]+\}/.test(text);
3525
- }
3526
3657
  function substituteVariables(template, variables) {
3527
- return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
3658
+ return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
3528
3659
  return variables[varName] ?? match;
3529
3660
  });
3530
3661
  }
3531
3662
 
3532
3663
  // src/evaluation/orchestrator.ts
3533
- var import_node_crypto3 = require("crypto");
3534
- var import_promises6 = require("fs/promises");
3535
- var import_node_path8 = __toESM(require("path"), 1);
3664
+ var import_node_crypto2 = require("crypto");
3665
+ var import_promises10 = require("fs/promises");
3666
+ var import_node_path13 = __toESM(require("path"), 1);
3536
3667
 
3537
3668
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
3538
3669
  var Node = class {
@@ -4095,6 +4226,7 @@ async function evaluateCandidate(options) {
4095
4226
  }
4096
4227
  }
4097
4228
  return {
4229
+ timestamp: completedAt.toISOString(),
4098
4230
  eval_id: evalCase.id,
4099
4231
  dataset: evalCase.dataset,
4100
4232
  conversation_id: evalCase.conversation_id,
@@ -4102,14 +4234,12 @@ async function evaluateCandidate(options) {
4102
4234
  hits: score.hits,
4103
4235
  misses: score.misses,
4104
4236
  candidate_answer: candidate,
4105
- expected_aspect_count: score.expectedAspectCount,
4106
4237
  target: target.name,
4107
- timestamp: completedAt.toISOString(),
4108
4238
  reasoning: score.reasoning,
4109
4239
  raw_aspects: score.rawAspects,
4110
4240
  agent_provider_request: agentProviderRequest,
4111
4241
  lm_provider_request: lmProviderRequest,
4112
- evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4242
+ evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4113
4243
  evaluator_results: evaluatorResults
4114
4244
  };
4115
4245
  }
@@ -4186,7 +4316,7 @@ async function runEvaluatorList(options) {
4186
4316
  hits: score2.hits,
4187
4317
  misses: score2.misses,
4188
4318
  reasoning: score2.reasoning,
4189
- evaluator_raw_request: score2.evaluatorRawRequest
4319
+ evaluator_provider_request: score2.evaluatorRawRequest
4190
4320
  });
4191
4321
  continue;
4192
4322
  }
@@ -4213,7 +4343,7 @@ async function runEvaluatorList(options) {
4213
4343
  hits: score2.hits,
4214
4344
  misses: score2.misses,
4215
4345
  reasoning: score2.reasoning,
4216
- evaluator_raw_request: score2.evaluatorRawRequest
4346
+ evaluator_provider_request: score2.evaluatorRawRequest
4217
4347
  });
4218
4348
  continue;
4219
4349
  }
@@ -4266,7 +4396,7 @@ async function runLlmJudgeEvaluator(options) {
4266
4396
  promptInputs,
4267
4397
  now,
4268
4398
  judgeProvider,
4269
- systemPrompt: customPrompt,
4399
+ evaluatorTemplateOverride: customPrompt,
4270
4400
  evaluator: config
4271
4401
  });
4272
4402
  }
@@ -4307,22 +4437,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
4307
4437
  async function dumpPrompt(directory, evalCase, promptInputs) {
4308
4438
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
4309
4439
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
4310
- const filePath = import_node_path8.default.resolve(directory, filename);
4311
- await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
4440
+ const filePath = import_node_path13.default.resolve(directory, filename);
4441
+ await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
4312
4442
  const payload = {
4313
4443
  eval_id: evalCase.id,
4314
4444
  question: promptInputs.question,
4315
4445
  guidelines: promptInputs.guidelines,
4316
4446
  guideline_paths: evalCase.guideline_paths
4317
4447
  };
4318
- await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4448
+ await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4319
4449
  }
4320
4450
  function sanitizeFilename(value) {
4321
4451
  if (!value) {
4322
4452
  return "prompt";
4323
4453
  }
4324
4454
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
4325
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
4455
+ return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
4326
4456
  }
4327
4457
  async function invokeProvider(provider, options) {
4328
4458
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -4378,6 +4508,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4378
4508
  }
4379
4509
  }
4380
4510
  return {
4511
+ timestamp: timestamp.toISOString(),
4381
4512
  eval_id: evalCase.id,
4382
4513
  dataset: evalCase.dataset,
4383
4514
  conversation_id: evalCase.conversation_id,
@@ -4385,9 +4516,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4385
4516
  hits: [],
4386
4517
  misses: [`Error: ${message}`],
4387
4518
  candidate_answer: `Error occurred: ${message}`,
4388
- expected_aspect_count: 0,
4389
4519
  target: targetName,
4390
- timestamp: timestamp.toISOString(),
4391
4520
  raw_aspects: [],
4392
4521
  agent_provider_request: agentProviderRequest,
4393
4522
  lm_provider_request: lmProviderRequest,
@@ -4395,7 +4524,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4395
4524
  };
4396
4525
  }
4397
4526
  function createCacheKey(provider, target, evalCase, promptInputs) {
4398
- const hash = (0, import_node_crypto3.createHash)("sha256");
4527
+ const hash = (0, import_node_crypto2.createHash)("sha256");
4399
4528
  hash.update(provider.id);
4400
4529
  hash.update(target.name);
4401
4530
  hash.update(evalCase.id);