@agentv/core 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,15 +33,15 @@ __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
35
35
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
36
- buildDirectoryChain: () => buildDirectoryChain,
36
+ buildDirectoryChain: () => buildDirectoryChain2,
37
37
  buildPromptInputs: () => buildPromptInputs,
38
- buildSearchRoots: () => buildSearchRoots,
38
+ buildSearchRoots: () => buildSearchRoots2,
39
39
  consumeCodexLogEntries: () => consumeCodexLogEntries,
40
40
  createAgentKernel: () => createAgentKernel,
41
41
  createProvider: () => createProvider,
42
42
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
43
43
  extractCodeBlocks: () => extractCodeBlocks,
44
- fileExists: () => fileExists,
44
+ fileExists: () => fileExists2,
45
45
  findGitRoot: () => findGitRoot,
46
46
  getHitCount: () => getHitCount,
47
47
  isEvaluatorKind: () => isEvaluatorKind,
@@ -57,7 +57,7 @@ __export(index_exports, {
57
57
  readTestSuiteMetadata: () => readTestSuiteMetadata,
58
58
  readTextFile: () => readTextFile,
59
59
  resolveAndCreateProvider: () => resolveAndCreateProvider,
60
- resolveFileReference: () => resolveFileReference,
60
+ resolveFileReference: () => resolveFileReference2,
61
61
  resolveTargetDefinition: () => resolveTargetDefinition,
62
62
  runEvalCase: () => runEvalCase,
63
63
  runEvaluation: () => runEvaluation,
@@ -116,47 +116,112 @@ function getHitCount(result) {
116
116
  }
117
117
 
118
118
  // src/evaluation/yaml-parser.ts
119
+ var import_promises5 = require("fs/promises");
120
+ var import_node_path6 = __toESM(require("path"), 1);
121
+ var import_yaml2 = require("yaml");
122
+
123
+ // src/evaluation/formatting/segment-formatter.ts
124
+ function extractCodeBlocks(segments) {
125
+ const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
126
+ const codeBlocks = [];
127
+ for (const segment of segments) {
128
+ const typeValue = segment["type"];
129
+ if (typeof typeValue !== "string" || typeValue !== "text") {
130
+ continue;
131
+ }
132
+ const textValue = segment["value"];
133
+ if (typeof textValue !== "string") {
134
+ continue;
135
+ }
136
+ const matches = textValue.match(CODE_BLOCK_PATTERN);
137
+ if (matches) {
138
+ codeBlocks.push(...matches);
139
+ }
140
+ }
141
+ return codeBlocks;
142
+ }
143
+ function formatFileContents(parts) {
144
+ const fileCount = parts.filter((p) => p.isFile).length;
145
+ if (fileCount > 0) {
146
+ return parts.map((part) => {
147
+ if (part.isFile && part.displayPath) {
148
+ return `<file path="${part.displayPath}">
149
+ ${part.content}
150
+ </file>`;
151
+ }
152
+ return part.content;
153
+ }).join("\n\n");
154
+ }
155
+ return parts.map((p) => p.content).join(" ");
156
+ }
157
+ function formatSegment(segment) {
158
+ const type = asString(segment.type);
159
+ if (type === "text") {
160
+ return asString(segment.value);
161
+ }
162
+ if (type === "guideline_ref") {
163
+ const refPath = asString(segment.path);
164
+ return refPath ? `<Attached: ${refPath}>` : void 0;
165
+ }
166
+ if (type === "file") {
167
+ const text = asString(segment.text);
168
+ const filePath = asString(segment.path);
169
+ if (text && filePath) {
170
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
171
+ }
172
+ }
173
+ return void 0;
174
+ }
175
+ function hasVisibleContent(segments) {
176
+ return segments.some((segment) => {
177
+ const type = asString(segment.type);
178
+ if (type === "text") {
179
+ const value = asString(segment.value);
180
+ return value !== void 0 && value.trim().length > 0;
181
+ }
182
+ if (type === "guideline_ref") {
183
+ return false;
184
+ }
185
+ if (type === "file") {
186
+ const text = asString(segment.text);
187
+ return text !== void 0 && text.trim().length > 0;
188
+ }
189
+ return false;
190
+ });
191
+ }
192
+ function asString(value) {
193
+ return typeof value === "string" ? value : void 0;
194
+ }
195
+
196
+ // src/evaluation/loaders/config-loader.ts
119
197
  var import_micromatch = __toESM(require("micromatch"), 1);
120
- var import_node_fs2 = require("fs");
121
198
  var import_promises2 = require("fs/promises");
122
199
  var import_node_path2 = __toESM(require("path"), 1);
123
- var import_node_url = require("url");
124
200
  var import_yaml = require("yaml");
125
201
 
126
- // src/evaluation/file-utils.ts
202
+ // src/evaluation/loaders/file-resolver.ts
127
203
  var import_node_fs = require("fs");
128
204
  var import_promises = require("fs/promises");
129
205
  var import_node_path = __toESM(require("path"), 1);
130
- async function fileExists(filePath) {
206
+ async function fileExists(absolutePath) {
131
207
  try {
132
- await (0, import_promises.access)(filePath, import_node_fs.constants.F_OK);
208
+ await (0, import_promises.access)(absolutePath, import_node_fs.constants.F_OK);
133
209
  return true;
134
210
  } catch {
135
211
  return false;
136
212
  }
137
213
  }
138
- function normalizeLineEndings(content) {
139
- return content.replace(/\r\n/g, "\n");
140
- }
141
- async function readTextFile(filePath) {
142
- const content = await (0, import_promises.readFile)(filePath, "utf8");
143
- return normalizeLineEndings(content);
144
- }
145
- async function findGitRoot(startPath) {
146
- let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
147
- const root = import_node_path.default.parse(currentDir).root;
148
- while (currentDir !== root) {
149
- const gitPath = import_node_path.default.join(currentDir, ".git");
150
- if (await fileExists(gitPath)) {
151
- return currentDir;
152
- }
153
- const parentDir = import_node_path.default.dirname(currentDir);
154
- if (parentDir === currentDir) {
155
- break;
214
+ function resolveToAbsolutePath(candidate) {
215
+ if (candidate instanceof URL) {
216
+ return new URL(candidate).pathname;
217
+ }
218
+ if (typeof candidate === "string") {
219
+ if (candidate.startsWith("file://")) {
220
+ return new URL(candidate).pathname;
156
221
  }
157
- currentDir = parentDir;
222
+ return import_node_path.default.resolve(candidate);
158
223
  }
159
- return null;
224
+ throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
160
225
  }
161
226
  function buildDirectoryChain(filePath, repoRoot) {
162
227
  const directories = [];
@@ -234,44 +299,15 @@ async function resolveFileReference(rawValue, searchRoots) {
234
299
  return { displayPath, attempted };
235
300
  }
236
301
 
237
- // src/evaluation/yaml-parser.ts
238
- var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
302
+ // src/evaluation/loaders/config-loader.ts
303
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
239
304
  var ANSI_YELLOW = "\x1B[33m";
240
305
  var ANSI_RESET = "\x1B[0m";
241
- var SCHEMA_EVAL_V2 = "agentv-eval-v2";
242
- var SCHEMA_CONFIG_V2 = "agentv-config-v2";
243
- async function readTestSuiteMetadata(testFilePath) {
244
- try {
245
- const absolutePath = import_node_path2.default.resolve(testFilePath);
246
- const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
247
- const parsed = (0, import_yaml.parse)(content);
248
- if (!isJsonObject(parsed)) {
249
- return {};
250
- }
251
- return { target: extractTargetFromSuite(parsed) };
252
- } catch {
253
- return {};
254
- }
255
- }
256
- function extractTargetFromSuite(suite) {
257
- const execution = suite.execution;
258
- if (execution && typeof execution === "object" && !Array.isArray(execution)) {
259
- const executionTarget = execution.target;
260
- if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
261
- return executionTarget.trim();
262
- }
263
- }
264
- const targetValue = suite.target;
265
- if (typeof targetValue === "string" && targetValue.trim().length > 0) {
266
- return targetValue.trim();
267
- }
268
- return void 0;
269
- }
270
306
  async function loadConfig(evalFilePath, repoRoot) {
271
307
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
272
308
  for (const directory of directories) {
273
309
  const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
274
- if (!await fileExists2(configPath)) {
310
+ if (!await fileExists(configPath)) {
275
311
  continue;
276
312
  }
277
313
  try {
@@ -313,24 +349,134 @@ function isGuidelineFile(filePath, patterns) {
313
349
  const patternsToUse = patterns ?? [];
314
350
  return import_micromatch.default.isMatch(normalized, patternsToUse);
315
351
  }
316
- function extractCodeBlocks(segments) {
317
- const codeBlocks = [];
318
- for (const segment of segments) {
319
- const typeValue = segment["type"];
320
- if (typeof typeValue !== "string" || typeValue !== "text") {
352
+ function extractTargetFromSuite(suite) {
353
+ const execution = suite.execution;
354
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
355
+ const executionTarget = execution.target;
356
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
357
+ return executionTarget.trim();
358
+ }
359
+ }
360
+ const targetValue = suite.target;
361
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
362
+ return targetValue.trim();
363
+ }
364
+ return void 0;
365
+ }
366
+ function logWarning(message) {
367
+ console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
368
+ }
369
+
370
+ // src/evaluation/loaders/evaluator-parser.ts
371
+ var import_node_path3 = __toESM(require("path"), 1);
372
+ var ANSI_YELLOW2 = "\x1B[33m";
373
+ var ANSI_RESET2 = "\x1B[0m";
374
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
375
+ const execution = rawEvalCase.execution;
376
+ const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
377
+ if (candidateEvaluators === void 0) {
378
+ return void 0;
379
+ }
380
+ if (!Array.isArray(candidateEvaluators)) {
381
+ logWarning2(`Skipping evaluators for '${evalId}': expected array`);
382
+ return void 0;
383
+ }
384
+ const evaluators = [];
385
+ for (const rawEvaluator of candidateEvaluators) {
386
+ if (!isJsonObject2(rawEvaluator)) {
387
+ logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
321
388
  continue;
322
389
  }
323
- const textValue = segment["value"];
324
- if (typeof textValue !== "string") {
390
+ const name = asString2(rawEvaluator.name);
391
+ const typeValue = rawEvaluator.type;
392
+ if (!name || !isEvaluatorKind(typeValue)) {
393
+ logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
325
394
  continue;
326
395
  }
327
- const matches = textValue.match(CODE_BLOCK_PATTERN);
328
- if (matches) {
329
- codeBlocks.push(...matches);
396
+ if (typeValue === "code") {
397
+ const script = asString2(rawEvaluator.script);
398
+ if (!script) {
399
+ logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
400
+ continue;
401
+ }
402
+ const cwd = asString2(rawEvaluator.cwd);
403
+ let resolvedCwd;
404
+ if (cwd) {
405
+ const resolved = await resolveFileReference(cwd, searchRoots);
406
+ if (resolved.resolvedPath) {
407
+ resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
408
+ } else {
409
+ logWarning2(
410
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
411
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
412
+ );
413
+ }
414
+ } else {
415
+ resolvedCwd = searchRoots[0];
416
+ }
417
+ evaluators.push({
418
+ name,
419
+ type: "code",
420
+ script,
421
+ cwd,
422
+ resolvedCwd
423
+ });
424
+ continue;
425
+ }
426
+ const prompt = asString2(rawEvaluator.prompt);
427
+ let promptPath;
428
+ if (prompt) {
429
+ const resolved = await resolveFileReference(prompt, searchRoots);
430
+ if (resolved.resolvedPath) {
431
+ promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
432
+ } else {
433
+ logWarning2(
434
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
435
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
436
+ );
437
+ }
330
438
  }
439
+ const _model = asString2(rawEvaluator.model);
440
+ evaluators.push({
441
+ name,
442
+ type: "llm_judge",
443
+ prompt,
444
+ promptPath
445
+ });
446
+ }
447
+ return evaluators.length > 0 ? evaluators : void 0;
448
+ }
449
+ function coerceEvaluator(candidate, contextId) {
450
+ if (typeof candidate !== "string") {
451
+ return void 0;
452
+ }
453
+ if (isEvaluatorKind(candidate)) {
454
+ return candidate;
455
+ }
456
+ logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
457
+ return void 0;
458
+ }
459
+ function asString2(value) {
460
+ return typeof value === "string" ? value : void 0;
461
+ }
462
+ function isJsonObject2(value) {
463
+ return typeof value === "object" && value !== null && !Array.isArray(value);
464
+ }
465
+ function logWarning2(message, details) {
466
+ if (details && details.length > 0) {
467
+ const detailBlock = details.join("\n");
468
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}
469
+ ${detailBlock}${ANSI_RESET2}`);
470
+ } else {
471
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
331
472
  }
332
- return codeBlocks;
333
473
  }
474
+
475
+ // src/evaluation/loaders/message-processor.ts
476
+ var import_promises3 = require("fs/promises");
477
+ var import_node_path4 = __toESM(require("path"), 1);
478
+ var ANSI_YELLOW3 = "\x1B[33m";
479
+ var ANSI_RESET3 = "\x1B[0m";
334
480
  async function processMessages(options) {
335
481
  const {
336
482
  messages,
@@ -356,9 +502,9 @@ async function processMessages(options) {
356
502
  if (!isJsonObject(rawSegment)) {
357
503
  continue;
358
504
  }
359
- const segmentType = asString(rawSegment.type);
505
+ const segmentType = asString3(rawSegment.type);
360
506
  if (segmentType === "file") {
361
- const rawValue = asString(rawSegment.value);
507
+ const rawValue = asString3(rawSegment.value);
362
508
  if (!rawValue) {
363
509
  continue;
364
510
  }
@@ -369,15 +515,15 @@ async function processMessages(options) {
369
515
  if (!resolvedPath) {
370
516
  const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
371
517
  const context = messageType === "input" ? "" : " in expected_messages";
372
- logWarning(`File not found${context}: ${displayPath}`, attempts);
518
+ logWarning3(`File not found${context}: ${displayPath}`, attempts);
373
519
  continue;
374
520
  }
375
521
  try {
376
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
522
+ const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
377
523
  if (messageType === "input" && guidelinePatterns && guidelinePaths) {
378
- const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
524
+ const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
379
525
  if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
380
- guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
526
+ guidelinePaths.push(import_node_path4.default.resolve(resolvedPath));
381
527
  if (verbose) {
382
528
  console.log(` [Guideline] Found: ${displayPath}`);
383
529
  console.log(` Resolved to: ${resolvedPath}`);
@@ -389,7 +535,7 @@ async function processMessages(options) {
389
535
  type: "file",
390
536
  path: displayPath,
391
537
  text: fileContent,
392
- resolvedPath: import_node_path2.default.resolve(resolvedPath)
538
+ resolvedPath: import_node_path4.default.resolve(resolvedPath)
393
539
  });
394
540
  if (verbose) {
395
541
  const label = messageType === "input" ? "[File]" : "[Expected Output File]";
@@ -398,7 +544,7 @@ async function processMessages(options) {
398
544
  }
399
545
  } catch (error) {
400
546
  const context = messageType === "input" ? "" : " expected output";
401
- logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
547
+ logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
402
548
  }
403
549
  continue;
404
550
  }
@@ -412,201 +558,117 @@ async function processMessages(options) {
412
558
  }
413
559
  return segments;
414
560
  }
415
- async function loadEvalCases(evalFilePath, repoRoot, options) {
416
- const verbose = options?.verbose ?? false;
417
- const evalIdFilter = options?.evalId;
418
- const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
419
- if (!await fileExists2(absoluteTestPath)) {
420
- throw new Error(`Test file not found: ${evalFilePath}`);
421
- }
422
- const repoRootPath = resolveToAbsolutePath(repoRoot);
423
- const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
424
- const config = await loadConfig(absoluteTestPath, repoRootPath);
425
- const guidelinePatterns = config?.guideline_patterns;
426
- const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
427
- const parsed = (0, import_yaml.parse)(rawFile);
428
- if (!isJsonObject(parsed)) {
429
- throw new Error(`Invalid test file format: ${evalFilePath}`);
430
- }
431
- const suite = parsed;
432
- const datasetNameFromSuite = asString(suite.dataset)?.trim();
433
- const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
434
- const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
435
- const schema = suite.$schema;
436
- if (schema !== SCHEMA_EVAL_V2) {
437
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
438
- Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
439
- throw new Error(message);
561
+ async function resolveAssistantContent(content, searchRoots, verbose) {
562
+ if (typeof content === "string") {
563
+ return content;
440
564
  }
441
- const rawTestcases = suite.evalcases;
442
- if (!Array.isArray(rawTestcases)) {
443
- throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
565
+ if (!content) {
566
+ return "";
444
567
  }
445
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
446
- const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
447
- const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
448
- const results = [];
449
- for (const rawEvalcase of rawTestcases) {
450
- if (!isJsonObject(rawEvalcase)) {
451
- logWarning("Skipping invalid eval case entry (expected object)");
568
+ const parts = [];
569
+ for (const entry of content) {
570
+ if (typeof entry === "string") {
571
+ parts.push({ content: entry, isFile: false });
452
572
  continue;
453
573
  }
454
- const evalcase = rawEvalcase;
455
- const id = asString(evalcase.id);
456
- if (evalIdFilter && id !== evalIdFilter) {
574
+ if (!isJsonObject(entry)) {
457
575
  continue;
458
576
  }
459
- const conversationId = asString(evalcase.conversation_id);
460
- const outcome = asString(evalcase.outcome);
461
- const inputMessagesValue = evalcase.input_messages;
462
- const expectedMessagesValue = evalcase.expected_messages;
463
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
464
- logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
577
+ const segmentType = asString3(entry.type);
578
+ if (segmentType === "file") {
579
+ const rawValue = asString3(entry.value);
580
+ if (!rawValue) {
581
+ continue;
582
+ }
583
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
584
+ rawValue,
585
+ searchRoots
586
+ );
587
+ if (!resolvedPath) {
588
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
589
+ logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
590
+ continue;
591
+ }
592
+ try {
593
+ const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
594
+ parts.push({ content: fileContent, isFile: true, displayPath });
595
+ if (verbose) {
596
+ console.log(` [Expected Assistant File] Found: ${displayPath}`);
597
+ console.log(` Resolved to: ${resolvedPath}`);
598
+ }
599
+ } catch (error) {
600
+ logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
601
+ }
465
602
  continue;
466
603
  }
467
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
468
- const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
469
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
470
- if (hasExpectedMessages && expectedMessages.length === 0) {
471
- logWarning(`No valid expected message found for eval case: ${id}`);
604
+ const textValue = asString3(entry.text);
605
+ if (typeof textValue === "string") {
606
+ parts.push({ content: textValue, isFile: false });
472
607
  continue;
473
608
  }
474
- if (expectedMessages.length > 1) {
475
- logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
476
- }
477
- const guidelinePaths = [];
478
- const inputTextParts = [];
479
- const inputSegments = await processMessages({
480
- messages: inputMessages,
481
- searchRoots,
482
- repoRootPath,
483
- guidelinePatterns,
484
- guidelinePaths,
485
- textParts: inputTextParts,
486
- messageType: "input",
487
- verbose
488
- });
489
- const outputSegments = hasExpectedMessages ? await processMessages({
490
- messages: expectedMessages,
491
- searchRoots,
492
- repoRootPath,
493
- guidelinePatterns,
494
- messageType: "output",
495
- verbose
496
- }) : [];
497
- const codeSnippets = extractCodeBlocks(inputSegments);
498
- const expectedContent = expectedMessages[0]?.content;
499
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
500
- const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
501
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
502
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
503
- const userFilePaths = [];
504
- for (const segment of inputSegments) {
505
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
506
- userFilePaths.push(segment.resolvedPath);
507
- }
508
- }
509
- const allFilePaths = [
510
- ...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
511
- ...userFilePaths
512
- ];
513
- const testCase = {
514
- id,
515
- dataset: datasetName,
516
- conversation_id: conversationId,
517
- question,
518
- input_messages: inputMessages,
519
- input_segments: inputSegments,
520
- output_segments: outputSegments,
521
- reference_answer: referenceAnswer,
522
- guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
523
- guideline_patterns: guidelinePatterns,
524
- file_paths: allFilePaths,
525
- code_snippets: codeSnippets,
526
- expected_outcome: outcome,
527
- evaluator: evalCaseEvaluatorKind,
528
- evaluators
529
- };
530
- if (verbose) {
531
- console.log(`
532
- [Eval Case: ${id}]`);
533
- if (testCase.guideline_paths.length > 0) {
534
- console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
535
- for (const guidelinePath of testCase.guideline_paths) {
536
- console.log(` - ${guidelinePath}`);
537
- }
538
- } else {
539
- console.log(" No guidelines found");
540
- }
609
+ const valueValue = asString3(entry.value);
610
+ if (typeof valueValue === "string") {
611
+ parts.push({ content: valueValue, isFile: false });
612
+ continue;
541
613
  }
542
- results.push(testCase);
614
+ parts.push({ content: JSON.stringify(entry), isFile: false });
543
615
  }
544
- return results;
616
+ return formatFileContents(parts);
545
617
  }
546
- function needsRoleMarkers(messages, processedSegmentsByMessage) {
547
- if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
548
- return true;
549
- }
550
- let messagesWithContent = 0;
551
- for (const segments of processedSegmentsByMessage) {
552
- if (hasVisibleContent(segments)) {
553
- messagesWithContent++;
554
- }
555
- }
556
- return messagesWithContent > 1;
618
+ function asString3(value) {
619
+ return typeof value === "string" ? value : void 0;
557
620
  }
558
- function hasVisibleContent(segments) {
559
- return segments.some((segment) => {
560
- const type = asString(segment.type);
561
- if (type === "text") {
562
- const value = asString(segment.value);
563
- return value !== void 0 && value.trim().length > 0;
564
- }
565
- if (type === "guideline_ref") {
566
- return false;
567
- }
568
- if (type === "file") {
569
- const text = asString(segment.text);
570
- return text !== void 0 && text.trim().length > 0;
571
- }
572
- return false;
573
- });
621
+ function cloneJsonObject(source) {
622
+ const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
623
+ return Object.fromEntries(entries);
574
624
  }
575
- function formatSegment(segment) {
576
- const type = asString(segment.type);
577
- if (type === "text") {
578
- return asString(segment.value);
625
+ function cloneJsonValue(value) {
626
+ if (value === null) {
627
+ return null;
579
628
  }
580
- if (type === "guideline_ref") {
581
- const refPath = asString(segment.path);
582
- return refPath ? `<Attached: ${refPath}>` : void 0;
629
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
630
+ return value;
583
631
  }
584
- if (type === "file") {
585
- const text = asString(segment.text);
586
- const filePath = asString(segment.path);
587
- if (text && filePath) {
588
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
589
- }
632
+ if (Array.isArray(value)) {
633
+ return value.map((item) => cloneJsonValue(item));
634
+ }
635
+ if (typeof value === "object") {
636
+ return cloneJsonObject(value);
637
+ }
638
+ return value;
639
+ }
640
+ function logWarning3(message, details) {
641
+ if (details && details.length > 0) {
642
+ const detailBlock = details.join("\n");
643
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
644
+ ${detailBlock}${ANSI_RESET3}`);
645
+ } else {
646
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
590
647
  }
591
- return void 0;
592
648
  }
649
+
650
+ // src/evaluation/formatting/prompt-builder.ts
651
+ var import_promises4 = require("fs/promises");
652
+ var import_node_path5 = __toESM(require("path"), 1);
653
+ var ANSI_YELLOW4 = "\x1B[33m";
654
+ var ANSI_RESET4 = "\x1B[0m";
593
655
  async function buildPromptInputs(testCase) {
594
656
  const guidelineParts = [];
595
657
  for (const rawPath of testCase.guideline_paths) {
596
- const absolutePath = import_node_path2.default.resolve(rawPath);
597
- if (!await fileExists2(absolutePath)) {
598
- logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
658
+ const absolutePath = import_node_path5.default.resolve(rawPath);
659
+ if (!await fileExists(absolutePath)) {
660
+ logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
599
661
  continue;
600
662
  }
601
663
  try {
602
- const content = (await (0, import_promises2.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
664
+ const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
603
665
  guidelineParts.push({
604
666
  content,
605
667
  isFile: true,
606
- displayPath: import_node_path2.default.basename(absolutePath)
668
+ displayPath: import_node_path5.default.basename(absolutePath)
607
669
  });
608
670
  } catch (error) {
609
- logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
671
+ logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
610
672
  }
611
673
  }
612
674
  const guidelines = formatFileContents(guidelineParts);
@@ -630,9 +692,9 @@ async function buildPromptInputs(testCase) {
630
692
  messageSegments.push({ type: "text", value: segment });
631
693
  }
632
694
  } else if (isJsonObject(segment)) {
633
- const type = asString(segment.type);
695
+ const type = asString4(segment.type);
634
696
  if (type === "file") {
635
- const value = asString(segment.value);
697
+ const value = asString4(segment.value);
636
698
  if (!value) continue;
637
699
  if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
638
700
  messageSegments.push({ type: "guideline_ref", path: value });
@@ -643,7 +705,7 @@ async function buildPromptInputs(testCase) {
643
705
  messageSegments.push({ type: "file", text: fileText, path: value });
644
706
  }
645
707
  } else if (type === "text") {
646
- const textValue = asString(segment.value);
708
+ const textValue = asString4(segment.value);
647
709
  if (textValue && textValue.trim().length > 0) {
648
710
  messageSegments.push({ type: "text", value: textValue });
649
711
  }
@@ -699,6 +761,18 @@ ${messageContent}`);
699
761
  }) : void 0;
700
762
  return { question, guidelines, chatPrompt };
701
763
  }
764
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
765
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
766
+ return true;
767
+ }
768
+ let messagesWithContent = 0;
769
+ for (const segments of processedSegmentsByMessage) {
770
+ if (hasVisibleContent(segments)) {
771
+ messagesWithContent++;
772
+ }
773
+ }
774
+ return messagesWithContent > 1;
775
+ }
702
776
  function buildChatPromptFromSegments(options) {
703
777
  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
704
778
  if (messages.length === 0) {
@@ -770,211 +844,282 @@ ${guidelineContent.trim()}`);
770
844
  ...name ? { name } : {}
771
845
  });
772
846
  }
773
- return chatPrompt.length > 0 ? chatPrompt : void 0;
847
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
848
+ }
849
+ function asString4(value) {
850
+ return typeof value === "string" ? value : void 0;
851
+ }
852
+ function logWarning4(message) {
853
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
854
+ }
855
+
856
+ // src/evaluation/yaml-parser.ts
857
+ var ANSI_YELLOW5 = "\x1B[33m";
858
+ var ANSI_RESET5 = "\x1B[0m";
859
+ var SCHEMA_EVAL_V2 = "agentv-eval-v2";
860
+ async function readTestSuiteMetadata(testFilePath) {
861
+ try {
862
+ const absolutePath = import_node_path6.default.resolve(testFilePath);
863
+ const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
864
+ const parsed = (0, import_yaml2.parse)(content);
865
+ if (!isJsonObject(parsed)) {
866
+ return {};
867
+ }
868
+ return { target: extractTargetFromSuite(parsed) };
869
+ } catch {
870
+ return {};
871
+ }
872
+ }
873
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
874
+ const verbose = options?.verbose ?? false;
875
+ const evalIdFilter = options?.evalId;
876
+ const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
877
+ const repoRootPath = resolveToAbsolutePath(repoRoot);
878
+ const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
879
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
880
+ const guidelinePatterns = config?.guideline_patterns;
881
+ const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
882
+ const parsed = (0, import_yaml2.parse)(rawFile);
883
+ if (!isJsonObject(parsed)) {
884
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
885
+ }
886
+ const suite = parsed;
887
+ const datasetNameFromSuite = asString5(suite.dataset)?.trim();
888
+ const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
889
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
890
+ const schema = suite.$schema;
891
+ if (schema !== SCHEMA_EVAL_V2) {
892
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
893
+ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
894
+ throw new Error(message);
895
+ }
896
+ const rawTestcases = suite.evalcases;
897
+ if (!Array.isArray(rawTestcases)) {
898
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
899
+ }
900
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
901
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
902
+ const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
903
+ const results = [];
904
+ for (const rawEvalcase of rawTestcases) {
905
+ if (!isJsonObject(rawEvalcase)) {
906
+ logWarning5("Skipping invalid eval case entry (expected object)");
907
+ continue;
908
+ }
909
+ const evalcase = rawEvalcase;
910
+ const id = asString5(evalcase.id);
911
+ if (evalIdFilter && id !== evalIdFilter) {
912
+ continue;
913
+ }
914
+ const conversationId = asString5(evalcase.conversation_id);
915
+ const outcome = asString5(evalcase.outcome);
916
+ const inputMessagesValue = evalcase.input_messages;
917
+ const expectedMessagesValue = evalcase.expected_messages;
918
+ if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
919
+ logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
920
+ continue;
921
+ }
922
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
923
+ const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
924
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
925
+ if (hasExpectedMessages && expectedMessages.length === 0) {
926
+ logWarning5(`No valid expected message found for eval case: ${id}`);
927
+ continue;
928
+ }
929
+ if (expectedMessages.length > 1) {
930
+ logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
931
+ }
932
+ const guidelinePaths = [];
933
+ const inputTextParts = [];
934
+ const inputSegments = await processMessages({
935
+ messages: inputMessages,
936
+ searchRoots,
937
+ repoRootPath,
938
+ guidelinePatterns,
939
+ guidelinePaths,
940
+ textParts: inputTextParts,
941
+ messageType: "input",
942
+ verbose
943
+ });
944
+ const outputSegments = hasExpectedMessages ? await processMessages({
945
+ messages: expectedMessages,
946
+ searchRoots,
947
+ repoRootPath,
948
+ guidelinePatterns,
949
+ messageType: "output",
950
+ verbose
951
+ }) : [];
952
+ const codeSnippets = extractCodeBlocks(inputSegments);
953
+ const expectedContent = expectedMessages[0]?.content;
954
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
955
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
956
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
957
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
958
+ const userFilePaths = [];
959
+ for (const segment of inputSegments) {
960
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
961
+ userFilePaths.push(segment.resolvedPath);
962
+ }
963
+ }
964
+ const allFilePaths = [
965
+ ...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
966
+ ...userFilePaths
967
+ ];
968
+ const testCase = {
969
+ id,
970
+ dataset: datasetName,
971
+ conversation_id: conversationId,
972
+ question,
973
+ input_messages: inputMessages,
974
+ input_segments: inputSegments,
975
+ output_segments: outputSegments,
976
+ reference_answer: referenceAnswer,
977
+ guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
978
+ guideline_patterns: guidelinePatterns,
979
+ file_paths: allFilePaths,
980
+ code_snippets: codeSnippets,
981
+ expected_outcome: outcome,
982
+ evaluator: evalCaseEvaluatorKind,
983
+ evaluators
984
+ };
985
+ if (verbose) {
986
+ console.log(`
987
+ [Eval Case: ${id}]`);
988
+ if (testCase.guideline_paths.length > 0) {
989
+ console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
990
+ for (const guidelinePath of testCase.guideline_paths) {
991
+ console.log(` - ${guidelinePath}`);
992
+ }
993
+ } else {
994
+ console.log(" No guidelines found");
995
+ }
996
+ }
997
+ results.push(testCase);
998
+ }
999
+ return results;
1000
+ }
1001
+ function asString5(value) {
1002
+ return typeof value === "string" ? value : void 0;
1003
+ }
1004
+ function logWarning5(message, details) {
1005
+ if (details && details.length > 0) {
1006
+ const detailBlock = details.join("\n");
1007
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}
1008
+ ${detailBlock}${ANSI_RESET5}`);
1009
+ } else {
1010
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
1011
+ }
774
1012
  }
775
- async function fileExists2(absolutePath) {
1013
+
1014
+ // src/evaluation/file-utils.ts
1015
+ var import_node_fs2 = require("fs");
1016
+ var import_promises6 = require("fs/promises");
1017
+ var import_node_path7 = __toESM(require("path"), 1);
1018
+ async function fileExists2(filePath) {
776
1019
  try {
777
- await (0, import_promises2.access)(absolutePath, import_node_fs2.constants.F_OK);
1020
+ await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
778
1021
  return true;
779
1022
  } catch {
780
1023
  return false;
781
1024
  }
782
1025
  }
783
- function resolveToAbsolutePath(candidate) {
784
- if (candidate instanceof URL) {
785
- return (0, import_node_url.fileURLToPath)(candidate);
786
- }
787
- if (typeof candidate === "string") {
788
- if (candidate.startsWith("file://")) {
789
- return (0, import_node_url.fileURLToPath)(new URL(candidate));
790
- }
791
- return import_node_path2.default.resolve(candidate);
792
- }
793
- throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
794
- }
795
- function asString(value) {
796
- return typeof value === "string" ? value : void 0;
797
- }
798
- function cloneJsonObject(source) {
799
- const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
800
- return Object.fromEntries(entries);
1026
+ function normalizeLineEndings(content) {
1027
+ return content.replace(/\r\n/g, "\n");
801
1028
  }
802
- function cloneJsonValue(value) {
803
- if (value === null) {
804
- return null;
805
- }
806
- if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
807
- return value;
808
- }
809
- if (Array.isArray(value)) {
810
- return value.map((item) => cloneJsonValue(item));
811
- }
812
- return cloneJsonObject(value);
1029
+ async function readTextFile(filePath) {
1030
+ const content = await (0, import_promises6.readFile)(filePath, "utf8");
1031
+ return normalizeLineEndings(content);
813
1032
  }
814
- function formatFileContents(parts) {
815
- const fileCount = parts.filter((p) => p.isFile).length;
816
- if (fileCount > 0) {
817
- return parts.map((part) => {
818
- if (part.isFile && part.displayPath) {
819
- return `<file path="${part.displayPath}">
820
- ${part.content}
821
- </file>`;
822
- }
823
- return part.content;
824
- }).join("\n\n");
1033
+ async function findGitRoot(startPath) {
1034
+ let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
1035
+ const root = import_node_path7.default.parse(currentDir).root;
1036
+ while (currentDir !== root) {
1037
+ const gitPath = import_node_path7.default.join(currentDir, ".git");
1038
+ if (await fileExists2(gitPath)) {
1039
+ return currentDir;
1040
+ }
1041
+ const parentDir = import_node_path7.default.dirname(currentDir);
1042
+ if (parentDir === currentDir) {
1043
+ break;
1044
+ }
1045
+ currentDir = parentDir;
825
1046
  }
826
- return parts.map((p) => p.content).join(" ");
1047
+ return null;
827
1048
  }
828
- async function resolveAssistantContent(content, searchRoots, verbose) {
829
- if (typeof content === "string") {
830
- return content;
831
- }
832
- if (!content) {
833
- return "";
834
- }
835
- const parts = [];
836
- for (const entry of content) {
837
- if (typeof entry === "string") {
838
- parts.push({ content: entry, isFile: false });
839
- continue;
1049
+ function buildDirectoryChain2(filePath, repoRoot) {
1050
+ const directories = [];
1051
+ const seen = /* @__PURE__ */ new Set();
1052
+ const boundary = import_node_path7.default.resolve(repoRoot);
1053
+ let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
1054
+ while (current !== void 0) {
1055
+ if (!seen.has(current)) {
1056
+ directories.push(current);
1057
+ seen.add(current);
840
1058
  }
841
- if (!isJsonObject(entry)) {
842
- continue;
1059
+ if (current === boundary) {
1060
+ break;
843
1061
  }
844
- const segmentType = asString(entry.type);
845
- if (segmentType === "file") {
846
- const rawValue = asString(entry.value);
847
- if (!rawValue) {
848
- continue;
849
- }
850
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
851
- rawValue,
852
- searchRoots
853
- );
854
- if (!resolvedPath) {
855
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
856
- logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
857
- continue;
858
- }
859
- try {
860
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
861
- parts.push({ content: fileContent, isFile: true, displayPath });
862
- if (verbose) {
863
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
864
- console.log(` Resolved to: ${resolvedPath}`);
865
- }
866
- } catch (error) {
867
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
868
- }
869
- continue;
1062
+ const parent = import_node_path7.default.dirname(current);
1063
+ if (parent === current) {
1064
+ break;
870
1065
  }
871
- const textValue = asString(entry.text);
872
- if (typeof textValue === "string") {
873
- parts.push({ content: textValue, isFile: false });
874
- continue;
1066
+ current = parent;
1067
+ }
1068
+ if (!seen.has(boundary)) {
1069
+ directories.push(boundary);
1070
+ }
1071
+ return directories;
1072
+ }
1073
+ function buildSearchRoots2(evalPath, repoRoot) {
1074
+ const uniqueRoots = [];
1075
+ const addRoot = (root) => {
1076
+ const normalized = import_node_path7.default.resolve(root);
1077
+ if (!uniqueRoots.includes(normalized)) {
1078
+ uniqueRoots.push(normalized);
875
1079
  }
876
- const valueValue = asString(entry.value);
877
- if (typeof valueValue === "string") {
878
- parts.push({ content: valueValue, isFile: false });
879
- continue;
1080
+ };
1081
+ let currentDir = import_node_path7.default.dirname(evalPath);
1082
+ let reachedBoundary = false;
1083
+ while (!reachedBoundary) {
1084
+ addRoot(currentDir);
1085
+ const parentDir = import_node_path7.default.dirname(currentDir);
1086
+ if (currentDir === repoRoot || parentDir === currentDir) {
1087
+ reachedBoundary = true;
1088
+ } else {
1089
+ currentDir = parentDir;
880
1090
  }
881
- parts.push({ content: JSON.stringify(entry), isFile: false });
882
1091
  }
883
- return formatFileContents(parts);
1092
+ addRoot(repoRoot);
1093
+ addRoot(process.cwd());
1094
+ return uniqueRoots;
884
1095
  }
885
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
886
- const execution = rawEvalCase.execution;
887
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
888
- if (candidateEvaluators === void 0) {
889
- return void 0;
1096
+ function trimLeadingSeparators2(value) {
1097
+ const trimmed = value.replace(/^[/\\]+/, "");
1098
+ return trimmed.length > 0 ? trimmed : value;
1099
+ }
1100
+ async function resolveFileReference2(rawValue, searchRoots) {
1101
+ const displayPath = trimLeadingSeparators2(rawValue);
1102
+ const potentialPaths = [];
1103
+ if (import_node_path7.default.isAbsolute(rawValue)) {
1104
+ potentialPaths.push(import_node_path7.default.normalize(rawValue));
890
1105
  }
891
- if (!Array.isArray(candidateEvaluators)) {
892
- logWarning(`Skipping evaluators for '${evalId}': expected array`);
893
- return void 0;
1106
+ for (const base of searchRoots) {
1107
+ potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
894
1108
  }
895
- const evaluators = [];
896
- for (const rawEvaluator of candidateEvaluators) {
897
- if (!isJsonObject(rawEvaluator)) {
898
- logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
899
- continue;
900
- }
901
- const name = asString(rawEvaluator.name);
902
- const typeValue = rawEvaluator.type;
903
- if (!name || !isEvaluatorKind(typeValue)) {
904
- logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
905
- continue;
906
- }
907
- if (typeValue === "code") {
908
- const script = asString(rawEvaluator.script);
909
- if (!script) {
910
- logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
911
- continue;
912
- }
913
- const cwd = asString(rawEvaluator.cwd);
914
- let resolvedCwd;
915
- if (cwd) {
916
- const resolved = await resolveFileReference(cwd, searchRoots);
917
- if (resolved.resolvedPath) {
918
- resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
919
- } else {
920
- logWarning(
921
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
922
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
923
- );
924
- }
925
- } else {
926
- resolvedCwd = searchRoots[0];
927
- }
928
- evaluators.push({
929
- name,
930
- type: "code",
931
- script,
932
- cwd,
933
- resolvedCwd
934
- });
1109
+ const attempted = [];
1110
+ const seen = /* @__PURE__ */ new Set();
1111
+ for (const candidate of potentialPaths) {
1112
+ const absoluteCandidate = import_node_path7.default.resolve(candidate);
1113
+ if (seen.has(absoluteCandidate)) {
935
1114
  continue;
936
1115
  }
937
- const prompt = asString(rawEvaluator.prompt);
938
- let promptPath;
939
- if (prompt) {
940
- const resolved = await resolveFileReference(prompt, searchRoots);
941
- if (resolved.resolvedPath) {
942
- promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
943
- } else {
944
- logWarning(
945
- `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
946
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
947
- );
948
- }
1116
+ seen.add(absoluteCandidate);
1117
+ attempted.push(absoluteCandidate);
1118
+ if (await fileExists2(absoluteCandidate)) {
1119
+ return { displayPath, resolvedPath: absoluteCandidate, attempted };
949
1120
  }
950
- const model = asString(rawEvaluator.model);
951
- evaluators.push({
952
- name,
953
- type: "llm_judge",
954
- prompt,
955
- promptPath
956
- });
957
- }
958
- return evaluators.length > 0 ? evaluators : void 0;
959
- }
960
- function coerceEvaluator(candidate, contextId) {
961
- if (typeof candidate !== "string") {
962
- return void 0;
963
- }
964
- if (isEvaluatorKind(candidate)) {
965
- return candidate;
966
- }
967
- logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
968
- return void 0;
969
- }
970
- function logWarning(message, details) {
971
- if (details && details.length > 0) {
972
- const detailBlock = details.join("\n");
973
- console.warn(`${ANSI_YELLOW}Warning: ${message}
974
- ${detailBlock}${ANSI_RESET}`);
975
- } else {
976
- console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
977
1121
  }
1122
+ return { displayPath, attempted };
978
1123
  }
979
1124
 
980
1125
  // src/evaluation/providers/ax.ts
@@ -1005,9 +1150,8 @@ function buildChatPrompt(request) {
1005
1150
  }
1006
1151
  function resolveSystemContent(request) {
1007
1152
  const systemSegments = [];
1008
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
1009
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
1010
- systemSegments.push(metadataSystemPrompt.trim());
1153
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
1154
+ systemSegments.push(request.systemPrompt.trim());
1011
1155
  } else {
1012
1156
  systemSegments.push(DEFAULT_SYSTEM_PROMPT);
1013
1157
  }
@@ -1258,9 +1402,9 @@ var GeminiProvider = class {
1258
1402
 
1259
1403
  // src/evaluation/providers/cli.ts
1260
1404
  var import_node_child_process = require("child_process");
1261
- var import_promises3 = __toESM(require("fs/promises"), 1);
1405
+ var import_promises7 = __toESM(require("fs/promises"), 1);
1262
1406
  var import_node_os = __toESM(require("os"), 1);
1263
- var import_node_path3 = __toESM(require("path"), 1);
1407
+ var import_node_path8 = __toESM(require("path"), 1);
1264
1408
  var import_node_util = require("util");
1265
1409
  var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
1266
1410
  var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
@@ -1357,7 +1501,7 @@ var CliProvider = class {
1357
1501
  const errorMsg = error instanceof Error ? error.message : String(error);
1358
1502
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1359
1503
  } finally {
1360
- await import_promises3.default.unlink(filePath).catch(() => {
1504
+ await import_promises7.default.unlink(filePath).catch(() => {
1361
1505
  });
1362
1506
  }
1363
1507
  }
@@ -1439,7 +1583,7 @@ function normalizeInputFiles(inputFiles) {
1439
1583
  }
1440
1584
  const unique = /* @__PURE__ */ new Map();
1441
1585
  for (const inputFile of inputFiles) {
1442
- const absolutePath = import_node_path3.default.resolve(inputFile);
1586
+ const absolutePath = import_node_path8.default.resolve(inputFile);
1443
1587
  if (!unique.has(absolutePath)) {
1444
1588
  unique.set(absolutePath, absolutePath);
1445
1589
  }
@@ -1453,7 +1597,7 @@ function formatFileList(files, template) {
1453
1597
  const formatter = template ?? "{path}";
1454
1598
  return files.map((filePath) => {
1455
1599
  const escapedPath = shellEscape(filePath);
1456
- const escapedName = shellEscape(import_node_path3.default.basename(filePath));
1600
+ const escapedName = shellEscape(import_node_path8.default.basename(filePath));
1457
1601
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1458
1602
  }).join(" ");
1459
1603
  }
@@ -1477,7 +1621,7 @@ function generateOutputFilePath(evalCaseId) {
1477
1621
  const safeEvalId = evalCaseId || "unknown";
1478
1622
  const timestamp = Date.now();
1479
1623
  const random = Math.random().toString(36).substring(2, 9);
1480
- return import_node_path3.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1624
+ return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1481
1625
  }
1482
1626
  function formatTimeoutSuffix(timeoutMs) {
1483
1627
  if (!timeoutMs || timeoutMs <= 0) {
@@ -1491,9 +1635,9 @@ function formatTimeoutSuffix(timeoutMs) {
1491
1635
  var import_node_child_process2 = require("child_process");
1492
1636
  var import_node_crypto = require("crypto");
1493
1637
  var import_node_fs3 = require("fs");
1494
- var import_promises4 = require("fs/promises");
1638
+ var import_promises8 = require("fs/promises");
1495
1639
  var import_node_os2 = require("os");
1496
- var import_node_path5 = __toESM(require("path"), 1);
1640
+ var import_node_path10 = __toESM(require("path"), 1);
1497
1641
  var import_node_util2 = require("util");
1498
1642
 
1499
1643
  // src/evaluation/providers/codex-log-tracker.ts
@@ -1550,7 +1694,7 @@ function subscribeToCodexLogEntries(listener) {
1550
1694
  }
1551
1695
 
1552
1696
  // src/evaluation/providers/preread.ts
1553
- var import_node_path4 = __toESM(require("path"), 1);
1697
+ var import_node_path9 = __toESM(require("path"), 1);
1554
1698
  function buildPromptDocument(request, inputFiles, options) {
1555
1699
  const parts = [];
1556
1700
  const guidelineFiles = collectGuidelineFiles(
@@ -1575,7 +1719,7 @@ function normalizeInputFiles2(inputFiles) {
1575
1719
  }
1576
1720
  const deduped = /* @__PURE__ */ new Map();
1577
1721
  for (const inputFile of inputFiles) {
1578
- const absolutePath = import_node_path4.default.resolve(inputFile);
1722
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1579
1723
  if (!deduped.has(absolutePath)) {
1580
1724
  deduped.set(absolutePath, absolutePath);
1581
1725
  }
@@ -1588,14 +1732,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
1588
1732
  }
1589
1733
  const unique = /* @__PURE__ */ new Map();
1590
1734
  for (const inputFile of inputFiles) {
1591
- const absolutePath = import_node_path4.default.resolve(inputFile);
1735
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1592
1736
  if (overrides?.has(absolutePath)) {
1593
1737
  if (!unique.has(absolutePath)) {
1594
1738
  unique.set(absolutePath, absolutePath);
1595
1739
  }
1596
1740
  continue;
1597
1741
  }
1598
- const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
1742
+ const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
1599
1743
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1600
1744
  if (!unique.has(absolutePath)) {
1601
1745
  unique.set(absolutePath, absolutePath);
@@ -1610,7 +1754,7 @@ function collectInputFiles(inputFiles) {
1610
1754
  }
1611
1755
  const unique = /* @__PURE__ */ new Map();
1612
1756
  for (const inputFile of inputFiles) {
1613
- const absolutePath = import_node_path4.default.resolve(inputFile);
1757
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1614
1758
  if (!unique.has(absolutePath)) {
1615
1759
  unique.set(absolutePath, absolutePath);
1616
1760
  }
@@ -1622,7 +1766,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
1622
1766
  return "";
1623
1767
  }
1624
1768
  const buildList = (files) => files.map((absolutePath) => {
1625
- const fileName = import_node_path4.default.basename(absolutePath);
1769
+ const fileName = import_node_path9.default.basename(absolutePath);
1626
1770
  const fileUri = pathToFileUri(absolutePath);
1627
1771
  return `* [${fileName}](${fileUri})`;
1628
1772
  });
@@ -1642,7 +1786,7 @@ ${buildList(inputFiles).join("\n")}.`);
1642
1786
  return sections.join("\n");
1643
1787
  }
1644
1788
  function pathToFileUri(filePath) {
1645
- const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
1789
+ const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
1646
1790
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1647
1791
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1648
1792
  return `file:///${normalizedPath}`;
@@ -1680,8 +1824,8 @@ var CodexProvider = class {
1680
1824
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1681
1825
  try {
1682
1826
  const promptContent = buildPromptDocument(request, inputFiles);
1683
- const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
1684
- await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
1827
+ const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
1828
+ await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
1685
1829
  const args = this.buildCodexArgs();
1686
1830
  const cwd = this.resolveCwd(workspaceRoot);
1687
1831
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -1730,7 +1874,7 @@ var CodexProvider = class {
1730
1874
  if (!this.config.cwd) {
1731
1875
  return workspaceRoot;
1732
1876
  }
1733
- return import_node_path5.default.resolve(this.config.cwd);
1877
+ return import_node_path10.default.resolve(this.config.cwd);
1734
1878
  }
1735
1879
  buildCodexArgs() {
1736
1880
  const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
@@ -1764,11 +1908,11 @@ var CodexProvider = class {
1764
1908
  }
1765
1909
  }
1766
1910
  async createWorkspace() {
1767
- return await (0, import_promises4.mkdtemp)(import_node_path5.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1911
+ return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1768
1912
  }
1769
1913
  async cleanupWorkspace(workspaceRoot) {
1770
1914
  try {
1771
- await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
1915
+ await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
1772
1916
  } catch {
1773
1917
  }
1774
1918
  }
@@ -1778,9 +1922,9 @@ var CodexProvider = class {
1778
1922
  return void 0;
1779
1923
  }
1780
1924
  if (this.config.logDir) {
1781
- return import_node_path5.default.resolve(this.config.logDir);
1925
+ return import_node_path10.default.resolve(this.config.logDir);
1782
1926
  }
1783
- return import_node_path5.default.join(process.cwd(), ".agentv", "logs", "codex");
1927
+ return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
1784
1928
  }
1785
1929
  async createStreamLogger(request) {
1786
1930
  const logDir = this.resolveLogDirectory();
@@ -1788,13 +1932,13 @@ var CodexProvider = class {
1788
1932
  return void 0;
1789
1933
  }
1790
1934
  try {
1791
- await (0, import_promises4.mkdir)(logDir, { recursive: true });
1935
+ await (0, import_promises8.mkdir)(logDir, { recursive: true });
1792
1936
  } catch (error) {
1793
1937
  const message = error instanceof Error ? error.message : String(error);
1794
1938
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
1795
1939
  return void 0;
1796
1940
  }
1797
- const filePath = import_node_path5.default.join(logDir, buildLogFilename(request, this.targetName));
1941
+ const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
1798
1942
  try {
1799
1943
  const logger = await CodexStreamLogger.create({
1800
1944
  filePath,
@@ -2009,9 +2153,9 @@ function tryParseJsonValue(rawLine) {
2009
2153
  async function locateExecutable(candidate) {
2010
2154
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
2011
2155
  if (includesPathSeparator) {
2012
- const resolved = import_node_path5.default.isAbsolute(candidate) ? candidate : import_node_path5.default.resolve(candidate);
2156
+ const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
2013
2157
  const executablePath = await ensureWindowsExecutableVariant(resolved);
2014
- await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
2158
+ await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
2015
2159
  return executablePath;
2016
2160
  }
2017
2161
  const locator = process.platform === "win32" ? "where" : "which";
@@ -2021,7 +2165,7 @@ async function locateExecutable(candidate) {
2021
2165
  const preferred = selectExecutableCandidate(lines);
2022
2166
  if (preferred) {
2023
2167
  const executablePath = await ensureWindowsExecutableVariant(preferred);
2024
- await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
2168
+ await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
2025
2169
  return executablePath;
2026
2170
  }
2027
2171
  } catch {
@@ -2055,7 +2199,7 @@ async function ensureWindowsExecutableVariant(candidate) {
2055
2199
  for (const ext of extensions) {
2056
2200
  const withExtension = `${candidate}${ext}`;
2057
2201
  try {
2058
- await (0, import_promises4.access)(withExtension, import_node_fs3.constants.F_OK);
2202
+ await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
2059
2203
  return withExtension;
2060
2204
  } catch {
2061
2205
  }
@@ -2867,7 +3011,7 @@ function resolveOptionalNumberArray(source, description) {
2867
3011
  }
2868
3012
 
2869
3013
  // src/evaluation/providers/vscode.ts
2870
- var import_node_path6 = __toESM(require("path"), 1);
3014
+ var import_node_path11 = __toESM(require("path"), 1);
2871
3015
  var import_subagent = require("subagent");
2872
3016
  var VSCodeProvider = class {
2873
3017
  id;
@@ -2980,6 +3124,9 @@ var VSCodeProvider = class {
2980
3124
  };
2981
3125
  function buildPromptDocument2(request, attachments, guidelinePatterns) {
2982
3126
  const parts = [];
3127
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
3128
+ parts.push(request.systemPrompt.trim());
3129
+ }
2983
3130
  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
2984
3131
  const attachmentFiles = collectAttachmentFiles(attachments);
2985
3132
  const nonGuidelineAttachments = attachmentFiles.filter(
@@ -2997,7 +3144,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
2997
3144
  return "";
2998
3145
  }
2999
3146
  const buildList = (files) => files.map((absolutePath) => {
3000
- const fileName = import_node_path6.default.basename(absolutePath);
3147
+ const fileName = import_node_path11.default.basename(absolutePath);
3001
3148
  const fileUri = pathToFileUri2(absolutePath);
3002
3149
  return `* [${fileName}](${fileUri})`;
3003
3150
  });
@@ -3022,8 +3169,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3022
3169
  }
3023
3170
  const unique = /* @__PURE__ */ new Map();
3024
3171
  for (const attachment of attachments) {
3025
- const absolutePath = import_node_path6.default.resolve(attachment);
3026
- const normalized = absolutePath.split(import_node_path6.default.sep).join("/");
3172
+ const absolutePath = import_node_path11.default.resolve(attachment);
3173
+ const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
3027
3174
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3028
3175
  if (!unique.has(absolutePath)) {
3029
3176
  unique.set(absolutePath, absolutePath);
@@ -3038,7 +3185,7 @@ function collectAttachmentFiles(attachments) {
3038
3185
  }
3039
3186
  const unique = /* @__PURE__ */ new Map();
3040
3187
  for (const attachment of attachments) {
3041
- const absolutePath = import_node_path6.default.resolve(attachment);
3188
+ const absolutePath = import_node_path11.default.resolve(attachment);
3042
3189
  if (!unique.has(absolutePath)) {
3043
3190
  unique.set(absolutePath, absolutePath);
3044
3191
  }
@@ -3046,7 +3193,7 @@ function collectAttachmentFiles(attachments) {
3046
3193
  return Array.from(unique.values());
3047
3194
  }
3048
3195
  function pathToFileUri2(filePath) {
3049
- const absolutePath = import_node_path6.default.isAbsolute(filePath) ? filePath : import_node_path6.default.resolve(filePath);
3196
+ const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
3050
3197
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3051
3198
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3052
3199
  return `file:///${normalizedPath}`;
@@ -3059,7 +3206,7 @@ function normalizeAttachments(attachments) {
3059
3206
  }
3060
3207
  const deduped = /* @__PURE__ */ new Set();
3061
3208
  for (const attachment of attachments) {
3062
- deduped.add(import_node_path6.default.resolve(attachment));
3209
+ deduped.add(import_node_path11.default.resolve(attachment));
3063
3210
  }
3064
3211
  return Array.from(deduped);
3065
3212
  }
@@ -3068,7 +3215,7 @@ function mergeAttachments(all) {
3068
3215
  for (const list of all) {
3069
3216
  if (!list) continue;
3070
3217
  for (const inputFile of list) {
3071
- deduped.add(import_node_path6.default.resolve(inputFile));
3218
+ deduped.add(import_node_path11.default.resolve(inputFile));
3072
3219
  }
3073
3220
  }
3074
3221
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3114,9 +3261,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3114
3261
 
3115
3262
  // src/evaluation/providers/targets-file.ts
3116
3263
  var import_node_fs4 = require("fs");
3117
- var import_promises5 = require("fs/promises");
3118
- var import_node_path7 = __toESM(require("path"), 1);
3119
- var import_yaml2 = require("yaml");
3264
+ var import_promises9 = require("fs/promises");
3265
+ var import_node_path12 = __toESM(require("path"), 1);
3266
+ var import_yaml3 = require("yaml");
3120
3267
 
3121
3268
  // src/evaluation/providers/types.ts
3122
3269
  var AGENT_PROVIDER_KINDS = [
@@ -3177,19 +3324,19 @@ function assertTargetDefinition(value, index, filePath) {
3177
3324
  }
3178
3325
  async function fileExists3(filePath) {
3179
3326
  try {
3180
- await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
3327
+ await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
3181
3328
  return true;
3182
3329
  } catch {
3183
3330
  return false;
3184
3331
  }
3185
3332
  }
3186
3333
  async function readTargetDefinitions(filePath) {
3187
- const absolutePath = import_node_path7.default.resolve(filePath);
3334
+ const absolutePath = import_node_path12.default.resolve(filePath);
3188
3335
  if (!await fileExists3(absolutePath)) {
3189
3336
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3190
3337
  }
3191
- const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
3192
- const parsed = (0, import_yaml2.parse)(raw);
3338
+ const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
3339
+ const parsed = (0, import_yaml3.parse)(raw);
3193
3340
  if (!isRecord(parsed)) {
3194
3341
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
3195
3342
  }
@@ -3232,18 +3379,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
3232
3379
  }
3233
3380
 
3234
3381
  // src/evaluation/evaluators.ts
3235
- var import_node_crypto2 = require("crypto");
3382
+ var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3383
+
3384
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3385
+
3386
+ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3387
+
3388
+ [[ ## expected_outcome ## ]]
3389
+ {{expected_outcome}}
3390
+
3391
+ [[ ## question ## ]]
3392
+ {{question}}
3393
+
3394
+ [[ ## reference_answer ## ]]
3395
+ {{reference_answer}}
3396
+
3397
+ [[ ## candidate_answer ## ]]
3398
+ {{candidate_answer}}`;
3236
3399
  var LlmJudgeEvaluator = class {
3237
3400
  kind = "llm_judge";
3238
3401
  resolveJudgeProvider;
3239
3402
  maxOutputTokens;
3240
3403
  temperature;
3241
- customPrompt;
3404
+ evaluatorTemplate;
3242
3405
  constructor(options) {
3243
3406
  this.resolveJudgeProvider = options.resolveJudgeProvider;
3244
3407
  this.maxOutputTokens = options.maxOutputTokens;
3245
3408
  this.temperature = options.temperature;
3246
- this.customPrompt = options.customPrompt;
3409
+ this.evaluatorTemplate = options.evaluatorTemplate;
3247
3410
  }
3248
3411
  async evaluate(context) {
3249
3412
  const judgeProvider = await this.resolveJudgeProvider(context);
@@ -3253,26 +3416,21 @@ var LlmJudgeEvaluator = class {
3253
3416
  return this.evaluateWithPrompt(context, judgeProvider);
3254
3417
  }
3255
3418
  async evaluateWithPrompt(context, judgeProvider) {
3256
- const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
3257
3419
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3258
- let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
3259
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
3260
- if (systemPrompt && hasTemplateVariables(systemPrompt)) {
3261
- const variables = {
3262
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3263
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3264
- candidate_answer: context.candidate,
3265
- reference_answer: context.evalCase.reference_answer ?? "",
3266
- expected_outcome: context.evalCase.expected_outcome,
3267
- question: formattedQuestion
3268
- };
3269
- prompt = substituteVariables(systemPrompt, variables);
3270
- systemPrompt = buildSystemPrompt(hasReferenceAnswer);
3271
- }
3272
- const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
3420
+ const variables = {
3421
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3422
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3423
+ candidate_answer: context.candidate.trim(),
3424
+ reference_answer: (context.evalCase.reference_answer ?? "").trim(),
3425
+ expected_outcome: context.evalCase.expected_outcome.trim(),
3426
+ question: formattedQuestion.trim()
3427
+ };
3428
+ const systemPrompt = buildOutputSchema();
3429
+ const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
3430
+ const userPrompt = substituteVariables(evaluatorTemplate, variables);
3273
3431
  const response = await judgeProvider.invoke({
3274
- question: prompt,
3275
- metadata,
3432
+ question: userPrompt,
3433
+ systemPrompt,
3276
3434
  evalCaseId: context.evalCase.id,
3277
3435
  attempt: context.attempt,
3278
3436
  maxOutputTokens: this.maxOutputTokens,
@@ -3285,11 +3443,9 @@ var LlmJudgeEvaluator = class {
3285
3443
  const reasoning = parsed.reasoning ?? response.reasoning;
3286
3444
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3287
3445
  const evaluatorRawRequest = {
3288
- id: (0, import_node_crypto2.randomUUID)(),
3289
- provider: judgeProvider.id,
3290
- prompt,
3291
- target: context.target.name,
3292
- ...systemPrompt !== void 0 && { systemPrompt }
3446
+ userPrompt,
3447
+ systemPrompt,
3448
+ target: judgeProvider.targetName
3293
3449
  };
3294
3450
  return {
3295
3451
  score,
@@ -3301,20 +3457,8 @@ var LlmJudgeEvaluator = class {
3301
3457
  };
3302
3458
  }
3303
3459
  };
3304
- function buildSystemPrompt(hasReferenceAnswer) {
3305
- const basePrompt = [
3306
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
3307
- ""
3308
- ];
3309
- if (hasReferenceAnswer) {
3310
- basePrompt.push(
3311
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
3312
- ""
3313
- );
3314
- }
3315
- basePrompt.push(
3316
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
3317
- "",
3460
+ function buildOutputSchema() {
3461
+ return [
3318
3462
  "You must respond with a single JSON object matching this schema:",
3319
3463
  "",
3320
3464
  "{",
@@ -3323,30 +3467,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
3323
3467
  ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
3324
3468
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
3325
3469
  "}"
3326
- );
3327
- return basePrompt.join("\n");
3328
- }
3329
- function buildQualityPrompt(evalCase, candidate, question) {
3330
- const parts = [
3331
- "[[ ## expected_outcome ## ]]",
3332
- evalCase.expected_outcome.trim(),
3333
- "",
3334
- "[[ ## question ## ]]",
3335
- question.trim(),
3336
- ""
3337
- ];
3338
- if (hasNonEmptyReferenceAnswer(evalCase)) {
3339
- parts.push(
3340
- "[[ ## reference_answer ## ]]",
3341
- evalCase.reference_answer.trim(),
3342
- ""
3343
- );
3344
- }
3345
- parts.push(
3346
- "[[ ## candidate_answer ## ]]",
3347
- candidate.trim()
3348
- );
3349
- return parts.join("\n");
3470
+ ].join("\n");
3350
3471
  }
3351
3472
  function clampScore(value) {
3352
3473
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -3428,9 +3549,6 @@ function extractJsonBlob(text) {
3428
3549
  function isNonEmptyString(value) {
3429
3550
  return typeof value === "string" && value.trim().length > 0;
3430
3551
  }
3431
- function hasNonEmptyReferenceAnswer(evalCase) {
3432
- return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
3433
- }
3434
3552
  var CodeEvaluator = class {
3435
3553
  kind = "code";
3436
3554
  script;
@@ -3536,19 +3654,16 @@ function parseJsonSafe(payload) {
3536
3654
  return void 0;
3537
3655
  }
3538
3656
  }
3539
- function hasTemplateVariables(text) {
3540
- return /\$\{[a-zA-Z0-9_]+\}/.test(text);
3541
- }
3542
3657
  function substituteVariables(template, variables) {
3543
- return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
3658
+ return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
3544
3659
  return variables[varName] ?? match;
3545
3660
  });
3546
3661
  }
3547
3662
 
3548
3663
  // src/evaluation/orchestrator.ts
3549
- var import_node_crypto3 = require("crypto");
3550
- var import_promises6 = require("fs/promises");
3551
- var import_node_path8 = __toESM(require("path"), 1);
3664
+ var import_node_crypto2 = require("crypto");
3665
+ var import_promises10 = require("fs/promises");
3666
+ var import_node_path13 = __toESM(require("path"), 1);
3552
3667
 
3553
3668
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
3554
3669
  var Node = class {
@@ -4111,6 +4226,7 @@ async function evaluateCandidate(options) {
4111
4226
  }
4112
4227
  }
4113
4228
  return {
4229
+ timestamp: completedAt.toISOString(),
4114
4230
  eval_id: evalCase.id,
4115
4231
  dataset: evalCase.dataset,
4116
4232
  conversation_id: evalCase.conversation_id,
@@ -4118,14 +4234,12 @@ async function evaluateCandidate(options) {
4118
4234
  hits: score.hits,
4119
4235
  misses: score.misses,
4120
4236
  candidate_answer: candidate,
4121
- expected_aspect_count: score.expectedAspectCount,
4122
4237
  target: target.name,
4123
- timestamp: completedAt.toISOString(),
4124
4238
  reasoning: score.reasoning,
4125
4239
  raw_aspects: score.rawAspects,
4126
4240
  agent_provider_request: agentProviderRequest,
4127
4241
  lm_provider_request: lmProviderRequest,
4128
- evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4242
+ evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4129
4243
  evaluator_results: evaluatorResults
4130
4244
  };
4131
4245
  }
@@ -4202,7 +4316,7 @@ async function runEvaluatorList(options) {
4202
4316
  hits: score2.hits,
4203
4317
  misses: score2.misses,
4204
4318
  reasoning: score2.reasoning,
4205
- evaluator_raw_request: score2.evaluatorRawRequest
4319
+ evaluator_provider_request: score2.evaluatorRawRequest
4206
4320
  });
4207
4321
  continue;
4208
4322
  }
@@ -4229,7 +4343,7 @@ async function runEvaluatorList(options) {
4229
4343
  hits: score2.hits,
4230
4344
  misses: score2.misses,
4231
4345
  reasoning: score2.reasoning,
4232
- evaluator_raw_request: score2.evaluatorRawRequest
4346
+ evaluator_provider_request: score2.evaluatorRawRequest
4233
4347
  });
4234
4348
  continue;
4235
4349
  }
@@ -4282,7 +4396,7 @@ async function runLlmJudgeEvaluator(options) {
4282
4396
  promptInputs,
4283
4397
  now,
4284
4398
  judgeProvider,
4285
- systemPrompt: customPrompt,
4399
+ evaluatorTemplateOverride: customPrompt,
4286
4400
  evaluator: config
4287
4401
  });
4288
4402
  }
@@ -4323,22 +4437,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
4323
4437
  async function dumpPrompt(directory, evalCase, promptInputs) {
4324
4438
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
4325
4439
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
4326
- const filePath = import_node_path8.default.resolve(directory, filename);
4327
- await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
4440
+ const filePath = import_node_path13.default.resolve(directory, filename);
4441
+ await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
4328
4442
  const payload = {
4329
4443
  eval_id: evalCase.id,
4330
4444
  question: promptInputs.question,
4331
4445
  guidelines: promptInputs.guidelines,
4332
4446
  guideline_paths: evalCase.guideline_paths
4333
4447
  };
4334
- await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4448
+ await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4335
4449
  }
4336
4450
  function sanitizeFilename(value) {
4337
4451
  if (!value) {
4338
4452
  return "prompt";
4339
4453
  }
4340
4454
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
4341
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
4455
+ return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
4342
4456
  }
4343
4457
  async function invokeProvider(provider, options) {
4344
4458
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -4394,6 +4508,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4394
4508
  }
4395
4509
  }
4396
4510
  return {
4511
+ timestamp: timestamp.toISOString(),
4397
4512
  eval_id: evalCase.id,
4398
4513
  dataset: evalCase.dataset,
4399
4514
  conversation_id: evalCase.conversation_id,
@@ -4401,9 +4516,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4401
4516
  hits: [],
4402
4517
  misses: [`Error: ${message}`],
4403
4518
  candidate_answer: `Error occurred: ${message}`,
4404
- expected_aspect_count: 0,
4405
4519
  target: targetName,
4406
- timestamp: timestamp.toISOString(),
4407
4520
  raw_aspects: [],
4408
4521
  agent_provider_request: agentProviderRequest,
4409
4522
  lm_provider_request: lmProviderRequest,
@@ -4411,7 +4524,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4411
4524
  };
4412
4525
  }
4413
4526
  function createCacheKey(provider, target, evalCase, promptInputs) {
4414
- const hash = (0, import_node_crypto3.createHash)("sha256");
4527
+ const hash = (0, import_node_crypto2.createHash)("sha256");
4415
4528
  hash.update(provider.id);
4416
4529
  hash.update(target.name);
4417
4530
  hash.update(evalCase.id);