@agentv/core 0.11.0 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,15 +33,15 @@ __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
35
35
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
36
- buildDirectoryChain: () => buildDirectoryChain,
36
+ buildDirectoryChain: () => buildDirectoryChain2,
37
37
  buildPromptInputs: () => buildPromptInputs,
38
- buildSearchRoots: () => buildSearchRoots,
38
+ buildSearchRoots: () => buildSearchRoots2,
39
39
  consumeCodexLogEntries: () => consumeCodexLogEntries,
40
40
  createAgentKernel: () => createAgentKernel,
41
41
  createProvider: () => createProvider,
42
42
  ensureVSCodeSubagents: () => ensureVSCodeSubagents,
43
43
  extractCodeBlocks: () => extractCodeBlocks,
44
- fileExists: () => fileExists,
44
+ fileExists: () => fileExists2,
45
45
  findGitRoot: () => findGitRoot,
46
46
  getHitCount: () => getHitCount,
47
47
  isEvaluatorKind: () => isEvaluatorKind,
@@ -57,7 +57,7 @@ __export(index_exports, {
57
57
  readTestSuiteMetadata: () => readTestSuiteMetadata,
58
58
  readTextFile: () => readTextFile,
59
59
  resolveAndCreateProvider: () => resolveAndCreateProvider,
60
- resolveFileReference: () => resolveFileReference,
60
+ resolveFileReference: () => resolveFileReference2,
61
61
  resolveTargetDefinition: () => resolveTargetDefinition,
62
62
  runEvalCase: () => runEvalCase,
63
63
  runEvaluation: () => runEvaluation,
@@ -116,47 +116,112 @@ function getHitCount(result) {
116
116
  }
117
117
 
118
118
  // src/evaluation/yaml-parser.ts
119
+ var import_promises5 = require("fs/promises");
120
+ var import_node_path6 = __toESM(require("path"), 1);
121
+ var import_yaml2 = require("yaml");
122
+
123
+ // src/evaluation/formatting/segment-formatter.ts
124
+ function extractCodeBlocks(segments) {
125
+ const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
126
+ const codeBlocks = [];
127
+ for (const segment of segments) {
128
+ const typeValue = segment["type"];
129
+ if (typeof typeValue !== "string" || typeValue !== "text") {
130
+ continue;
131
+ }
132
+ const textValue = segment["value"];
133
+ if (typeof textValue !== "string") {
134
+ continue;
135
+ }
136
+ const matches = textValue.match(CODE_BLOCK_PATTERN);
137
+ if (matches) {
138
+ codeBlocks.push(...matches);
139
+ }
140
+ }
141
+ return codeBlocks;
142
+ }
143
+ function formatFileContents(parts) {
144
+ const fileCount = parts.filter((p) => p.isFile).length;
145
+ if (fileCount > 0) {
146
+ return parts.map((part) => {
147
+ if (part.isFile && part.displayPath) {
148
+ return `<file path="${part.displayPath}">
149
+ ${part.content}
150
+ </file>`;
151
+ }
152
+ return part.content;
153
+ }).join("\n\n");
154
+ }
155
+ return parts.map((p) => p.content).join(" ");
156
+ }
157
+ function formatSegment(segment) {
158
+ const type = asString(segment.type);
159
+ if (type === "text") {
160
+ return asString(segment.value);
161
+ }
162
+ if (type === "guideline_ref") {
163
+ const refPath = asString(segment.path);
164
+ return refPath ? `<Attached: ${refPath}>` : void 0;
165
+ }
166
+ if (type === "file") {
167
+ const text = asString(segment.text);
168
+ const filePath = asString(segment.path);
169
+ if (text && filePath) {
170
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
171
+ }
172
+ }
173
+ return void 0;
174
+ }
175
+ function hasVisibleContent(segments) {
176
+ return segments.some((segment) => {
177
+ const type = asString(segment.type);
178
+ if (type === "text") {
179
+ const value = asString(segment.value);
180
+ return value !== void 0 && value.trim().length > 0;
181
+ }
182
+ if (type === "guideline_ref") {
183
+ return false;
184
+ }
185
+ if (type === "file") {
186
+ const text = asString(segment.text);
187
+ return text !== void 0 && text.trim().length > 0;
188
+ }
189
+ return false;
190
+ });
191
+ }
192
+ function asString(value) {
193
+ return typeof value === "string" ? value : void 0;
194
+ }
195
+
196
+ // src/evaluation/loaders/config-loader.ts
119
197
  var import_micromatch = __toESM(require("micromatch"), 1);
120
- var import_node_fs2 = require("fs");
121
198
  var import_promises2 = require("fs/promises");
122
199
  var import_node_path2 = __toESM(require("path"), 1);
123
- var import_node_url = require("url");
124
200
  var import_yaml = require("yaml");
125
201
 
126
- // src/evaluation/file-utils.ts
202
+ // src/evaluation/loaders/file-resolver.ts
127
203
  var import_node_fs = require("fs");
128
204
  var import_promises = require("fs/promises");
129
205
  var import_node_path = __toESM(require("path"), 1);
130
- async function fileExists(filePath) {
206
+ async function fileExists(absolutePath) {
131
207
  try {
132
- await (0, import_promises.access)(filePath, import_node_fs.constants.F_OK);
208
+ await (0, import_promises.access)(absolutePath, import_node_fs.constants.F_OK);
133
209
  return true;
134
210
  } catch {
135
211
  return false;
136
212
  }
137
213
  }
138
- function normalizeLineEndings(content) {
139
- return content.replace(/\r\n/g, "\n");
140
- }
141
- async function readTextFile(filePath) {
142
- const content = await (0, import_promises.readFile)(filePath, "utf8");
143
- return normalizeLineEndings(content);
144
- }
145
- async function findGitRoot(startPath) {
146
- let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
147
- const root = import_node_path.default.parse(currentDir).root;
148
- while (currentDir !== root) {
149
- const gitPath = import_node_path.default.join(currentDir, ".git");
150
- if (await fileExists(gitPath)) {
151
- return currentDir;
152
- }
153
- const parentDir = import_node_path.default.dirname(currentDir);
154
- if (parentDir === currentDir) {
155
- break;
214
+ function resolveToAbsolutePath(candidate) {
215
+ if (candidate instanceof URL) {
216
+ return new URL(candidate).pathname;
217
+ }
218
+ if (typeof candidate === "string") {
219
+ if (candidate.startsWith("file://")) {
220
+ return new URL(candidate).pathname;
156
221
  }
157
- currentDir = parentDir;
222
+ return import_node_path.default.resolve(candidate);
158
223
  }
159
- return null;
224
+ throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
160
225
  }
161
226
  function buildDirectoryChain(filePath, repoRoot) {
162
227
  const directories = [];
@@ -234,44 +299,15 @@ async function resolveFileReference(rawValue, searchRoots) {
234
299
  return { displayPath, attempted };
235
300
  }
236
301
 
237
- // src/evaluation/yaml-parser.ts
238
- var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
302
+ // src/evaluation/loaders/config-loader.ts
303
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
239
304
  var ANSI_YELLOW = "\x1B[33m";
240
305
  var ANSI_RESET = "\x1B[0m";
241
- var SCHEMA_EVAL_V2 = "agentv-eval-v2";
242
- var SCHEMA_CONFIG_V2 = "agentv-config-v2";
243
- async function readTestSuiteMetadata(testFilePath) {
244
- try {
245
- const absolutePath = import_node_path2.default.resolve(testFilePath);
246
- const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
247
- const parsed = (0, import_yaml.parse)(content);
248
- if (!isJsonObject(parsed)) {
249
- return {};
250
- }
251
- return { target: extractTargetFromSuite(parsed) };
252
- } catch {
253
- return {};
254
- }
255
- }
256
- function extractTargetFromSuite(suite) {
257
- const execution = suite.execution;
258
- if (execution && typeof execution === "object" && !Array.isArray(execution)) {
259
- const executionTarget = execution.target;
260
- if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
261
- return executionTarget.trim();
262
- }
263
- }
264
- const targetValue = suite.target;
265
- if (typeof targetValue === "string" && targetValue.trim().length > 0) {
266
- return targetValue.trim();
267
- }
268
- return void 0;
269
- }
270
306
  async function loadConfig(evalFilePath, repoRoot) {
271
307
  const directories = buildDirectoryChain(evalFilePath, repoRoot);
272
308
  for (const directory of directories) {
273
309
  const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
274
- if (!await fileExists2(configPath)) {
310
+ if (!await fileExists(configPath)) {
275
311
  continue;
276
312
  }
277
313
  try {
@@ -313,24 +349,134 @@ function isGuidelineFile(filePath, patterns) {
313
349
  const patternsToUse = patterns ?? [];
314
350
  return import_micromatch.default.isMatch(normalized, patternsToUse);
315
351
  }
316
- function extractCodeBlocks(segments) {
317
- const codeBlocks = [];
318
- for (const segment of segments) {
319
- const typeValue = segment["type"];
320
- if (typeof typeValue !== "string" || typeValue !== "text") {
352
+ function extractTargetFromSuite(suite) {
353
+ const execution = suite.execution;
354
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
355
+ const executionTarget = execution.target;
356
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
357
+ return executionTarget.trim();
358
+ }
359
+ }
360
+ const targetValue = suite.target;
361
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
362
+ return targetValue.trim();
363
+ }
364
+ return void 0;
365
+ }
366
+ function logWarning(message) {
367
+ console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
368
+ }
369
+
370
+ // src/evaluation/loaders/evaluator-parser.ts
371
+ var import_node_path3 = __toESM(require("path"), 1);
372
+ var ANSI_YELLOW2 = "\x1B[33m";
373
+ var ANSI_RESET2 = "\x1B[0m";
374
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
375
+ const execution = rawEvalCase.execution;
376
+ const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
377
+ if (candidateEvaluators === void 0) {
378
+ return void 0;
379
+ }
380
+ if (!Array.isArray(candidateEvaluators)) {
381
+ logWarning2(`Skipping evaluators for '${evalId}': expected array`);
382
+ return void 0;
383
+ }
384
+ const evaluators = [];
385
+ for (const rawEvaluator of candidateEvaluators) {
386
+ if (!isJsonObject2(rawEvaluator)) {
387
+ logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
321
388
  continue;
322
389
  }
323
- const textValue = segment["value"];
324
- if (typeof textValue !== "string") {
390
+ const name = asString2(rawEvaluator.name);
391
+ const typeValue = rawEvaluator.type;
392
+ if (!name || !isEvaluatorKind(typeValue)) {
393
+ logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
325
394
  continue;
326
395
  }
327
- const matches = textValue.match(CODE_BLOCK_PATTERN);
328
- if (matches) {
329
- codeBlocks.push(...matches);
396
+ if (typeValue === "code") {
397
+ const script = asString2(rawEvaluator.script);
398
+ if (!script) {
399
+ logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
400
+ continue;
401
+ }
402
+ const cwd = asString2(rawEvaluator.cwd);
403
+ let resolvedCwd;
404
+ if (cwd) {
405
+ const resolved = await resolveFileReference(cwd, searchRoots);
406
+ if (resolved.resolvedPath) {
407
+ resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
408
+ } else {
409
+ logWarning2(
410
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
411
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
412
+ );
413
+ }
414
+ } else {
415
+ resolvedCwd = searchRoots[0];
416
+ }
417
+ evaluators.push({
418
+ name,
419
+ type: "code",
420
+ script,
421
+ cwd,
422
+ resolvedCwd
423
+ });
424
+ continue;
425
+ }
426
+ const prompt = asString2(rawEvaluator.prompt);
427
+ let promptPath;
428
+ if (prompt) {
429
+ const resolved = await resolveFileReference(prompt, searchRoots);
430
+ if (resolved.resolvedPath) {
431
+ promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
432
+ } else {
433
+ logWarning2(
434
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
435
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
436
+ );
437
+ }
330
438
  }
439
+ const _model = asString2(rawEvaluator.model);
440
+ evaluators.push({
441
+ name,
442
+ type: "llm_judge",
443
+ prompt,
444
+ promptPath
445
+ });
446
+ }
447
+ return evaluators.length > 0 ? evaluators : void 0;
448
+ }
449
+ function coerceEvaluator(candidate, contextId) {
450
+ if (typeof candidate !== "string") {
451
+ return void 0;
452
+ }
453
+ if (isEvaluatorKind(candidate)) {
454
+ return candidate;
455
+ }
456
+ logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
457
+ return void 0;
458
+ }
459
+ function asString2(value) {
460
+ return typeof value === "string" ? value : void 0;
461
+ }
462
+ function isJsonObject2(value) {
463
+ return typeof value === "object" && value !== null && !Array.isArray(value);
464
+ }
465
+ function logWarning2(message, details) {
466
+ if (details && details.length > 0) {
467
+ const detailBlock = details.join("\n");
468
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}
469
+ ${detailBlock}${ANSI_RESET2}`);
470
+ } else {
471
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
331
472
  }
332
- return codeBlocks;
333
473
  }
474
+
475
+ // src/evaluation/loaders/message-processor.ts
476
+ var import_promises3 = require("fs/promises");
477
+ var import_node_path4 = __toESM(require("path"), 1);
478
+ var ANSI_YELLOW3 = "\x1B[33m";
479
+ var ANSI_RESET3 = "\x1B[0m";
334
480
  async function processMessages(options) {
335
481
  const {
336
482
  messages,
@@ -356,9 +502,9 @@ async function processMessages(options) {
356
502
  if (!isJsonObject(rawSegment)) {
357
503
  continue;
358
504
  }
359
- const segmentType = asString(rawSegment.type);
505
+ const segmentType = asString3(rawSegment.type);
360
506
  if (segmentType === "file") {
361
- const rawValue = asString(rawSegment.value);
507
+ const rawValue = asString3(rawSegment.value);
362
508
  if (!rawValue) {
363
509
  continue;
364
510
  }
@@ -369,15 +515,15 @@ async function processMessages(options) {
369
515
  if (!resolvedPath) {
370
516
  const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
371
517
  const context = messageType === "input" ? "" : " in expected_messages";
372
- logWarning(`File not found${context}: ${displayPath}`, attempts);
518
+ logWarning3(`File not found${context}: ${displayPath}`, attempts);
373
519
  continue;
374
520
  }
375
521
  try {
376
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
522
+ const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
377
523
  if (messageType === "input" && guidelinePatterns && guidelinePaths) {
378
- const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
524
+ const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
379
525
  if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
380
- guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
526
+ guidelinePaths.push(import_node_path4.default.resolve(resolvedPath));
381
527
  if (verbose) {
382
528
  console.log(` [Guideline] Found: ${displayPath}`);
383
529
  console.log(` Resolved to: ${resolvedPath}`);
@@ -389,7 +535,7 @@ async function processMessages(options) {
389
535
  type: "file",
390
536
  path: displayPath,
391
537
  text: fileContent,
392
- resolvedPath: import_node_path2.default.resolve(resolvedPath)
538
+ resolvedPath: import_node_path4.default.resolve(resolvedPath)
393
539
  });
394
540
  if (verbose) {
395
541
  const label = messageType === "input" ? "[File]" : "[Expected Output File]";
@@ -398,7 +544,7 @@ async function processMessages(options) {
398
544
  }
399
545
  } catch (error) {
400
546
  const context = messageType === "input" ? "" : " expected output";
401
- logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
547
+ logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
402
548
  }
403
549
  continue;
404
550
  }
@@ -412,201 +558,117 @@ async function processMessages(options) {
412
558
  }
413
559
  return segments;
414
560
  }
415
- async function loadEvalCases(evalFilePath, repoRoot, options) {
416
- const verbose = options?.verbose ?? false;
417
- const evalIdFilter = options?.evalId;
418
- const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
419
- if (!await fileExists2(absoluteTestPath)) {
420
- throw new Error(`Test file not found: ${evalFilePath}`);
421
- }
422
- const repoRootPath = resolveToAbsolutePath(repoRoot);
423
- const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
424
- const config = await loadConfig(absoluteTestPath, repoRootPath);
425
- const guidelinePatterns = config?.guideline_patterns;
426
- const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
427
- const parsed = (0, import_yaml.parse)(rawFile);
428
- if (!isJsonObject(parsed)) {
429
- throw new Error(`Invalid test file format: ${evalFilePath}`);
430
- }
431
- const suite = parsed;
432
- const datasetNameFromSuite = asString(suite.dataset)?.trim();
433
- const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
434
- const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
435
- const schema = suite.$schema;
436
- if (schema !== SCHEMA_EVAL_V2) {
437
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
438
- Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
439
- throw new Error(message);
561
+ async function resolveAssistantContent(content, searchRoots, verbose) {
562
+ if (typeof content === "string") {
563
+ return content;
440
564
  }
441
- const rawTestcases = suite.evalcases;
442
- if (!Array.isArray(rawTestcases)) {
443
- throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
565
+ if (!content) {
566
+ return "";
444
567
  }
445
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
446
- const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
447
- const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
448
- const results = [];
449
- for (const rawEvalcase of rawTestcases) {
450
- if (!isJsonObject(rawEvalcase)) {
451
- logWarning("Skipping invalid eval case entry (expected object)");
568
+ const parts = [];
569
+ for (const entry of content) {
570
+ if (typeof entry === "string") {
571
+ parts.push({ content: entry, isFile: false });
452
572
  continue;
453
573
  }
454
- const evalcase = rawEvalcase;
455
- const id = asString(evalcase.id);
456
- if (evalIdFilter && id !== evalIdFilter) {
574
+ if (!isJsonObject(entry)) {
457
575
  continue;
458
576
  }
459
- const conversationId = asString(evalcase.conversation_id);
460
- const outcome = asString(evalcase.outcome);
461
- const inputMessagesValue = evalcase.input_messages;
462
- const expectedMessagesValue = evalcase.expected_messages;
463
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
464
- logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
577
+ const segmentType = asString3(entry.type);
578
+ if (segmentType === "file") {
579
+ const rawValue = asString3(entry.value);
580
+ if (!rawValue) {
581
+ continue;
582
+ }
583
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference(
584
+ rawValue,
585
+ searchRoots
586
+ );
587
+ if (!resolvedPath) {
588
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
589
+ logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
590
+ continue;
591
+ }
592
+ try {
593
+ const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
594
+ parts.push({ content: fileContent, isFile: true, displayPath });
595
+ if (verbose) {
596
+ console.log(` [Expected Assistant File] Found: ${displayPath}`);
597
+ console.log(` Resolved to: ${resolvedPath}`);
598
+ }
599
+ } catch (error) {
600
+ logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
601
+ }
465
602
  continue;
466
603
  }
467
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
468
- const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
469
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
470
- if (hasExpectedMessages && expectedMessages.length === 0) {
471
- logWarning(`No valid expected message found for eval case: ${id}`);
604
+ const textValue = asString3(entry.text);
605
+ if (typeof textValue === "string") {
606
+ parts.push({ content: textValue, isFile: false });
472
607
  continue;
473
608
  }
474
- if (expectedMessages.length > 1) {
475
- logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
476
- }
477
- const guidelinePaths = [];
478
- const inputTextParts = [];
479
- const inputSegments = await processMessages({
480
- messages: inputMessages,
481
- searchRoots,
482
- repoRootPath,
483
- guidelinePatterns,
484
- guidelinePaths,
485
- textParts: inputTextParts,
486
- messageType: "input",
487
- verbose
488
- });
489
- const outputSegments = hasExpectedMessages ? await processMessages({
490
- messages: expectedMessages,
491
- searchRoots,
492
- repoRootPath,
493
- guidelinePatterns,
494
- messageType: "output",
495
- verbose
496
- }) : [];
497
- const codeSnippets = extractCodeBlocks(inputSegments);
498
- const expectedContent = expectedMessages[0]?.content;
499
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
500
- const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
501
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
502
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
503
- const userFilePaths = [];
504
- for (const segment of inputSegments) {
505
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
506
- userFilePaths.push(segment.resolvedPath);
507
- }
508
- }
509
- const allFilePaths = [
510
- ...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
511
- ...userFilePaths
512
- ];
513
- const testCase = {
514
- id,
515
- dataset: datasetName,
516
- conversation_id: conversationId,
517
- question,
518
- input_messages: inputMessages,
519
- input_segments: inputSegments,
520
- output_segments: outputSegments,
521
- reference_answer: referenceAnswer,
522
- guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
523
- guideline_patterns: guidelinePatterns,
524
- file_paths: allFilePaths,
525
- code_snippets: codeSnippets,
526
- expected_outcome: outcome,
527
- evaluator: evalCaseEvaluatorKind,
528
- evaluators
529
- };
530
- if (verbose) {
531
- console.log(`
532
- [Eval Case: ${id}]`);
533
- if (testCase.guideline_paths.length > 0) {
534
- console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
535
- for (const guidelinePath of testCase.guideline_paths) {
536
- console.log(` - ${guidelinePath}`);
537
- }
538
- } else {
539
- console.log(" No guidelines found");
540
- }
609
+ const valueValue = asString3(entry.value);
610
+ if (typeof valueValue === "string") {
611
+ parts.push({ content: valueValue, isFile: false });
612
+ continue;
541
613
  }
542
- results.push(testCase);
614
+ parts.push({ content: JSON.stringify(entry), isFile: false });
543
615
  }
544
- return results;
616
+ return formatFileContents(parts);
545
617
  }
546
- function needsRoleMarkers(messages, processedSegmentsByMessage) {
547
- if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
548
- return true;
549
- }
550
- let messagesWithContent = 0;
551
- for (const segments of processedSegmentsByMessage) {
552
- if (hasVisibleContent(segments)) {
553
- messagesWithContent++;
554
- }
555
- }
556
- return messagesWithContent > 1;
618
+ function asString3(value) {
619
+ return typeof value === "string" ? value : void 0;
557
620
  }
558
- function hasVisibleContent(segments) {
559
- return segments.some((segment) => {
560
- const type = asString(segment.type);
561
- if (type === "text") {
562
- const value = asString(segment.value);
563
- return value !== void 0 && value.trim().length > 0;
564
- }
565
- if (type === "guideline_ref") {
566
- return false;
567
- }
568
- if (type === "file") {
569
- const text = asString(segment.text);
570
- return text !== void 0 && text.trim().length > 0;
571
- }
572
- return false;
573
- });
621
+ function cloneJsonObject(source) {
622
+ const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
623
+ return Object.fromEntries(entries);
574
624
  }
575
- function formatSegment(segment) {
576
- const type = asString(segment.type);
577
- if (type === "text") {
578
- return asString(segment.value);
625
+ function cloneJsonValue(value) {
626
+ if (value === null) {
627
+ return null;
579
628
  }
580
- if (type === "guideline_ref") {
581
- const refPath = asString(segment.path);
582
- return refPath ? `<Attached: ${refPath}>` : void 0;
629
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
630
+ return value;
583
631
  }
584
- if (type === "file") {
585
- const text = asString(segment.text);
586
- const filePath = asString(segment.path);
587
- if (text && filePath) {
588
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
589
- }
632
+ if (Array.isArray(value)) {
633
+ return value.map((item) => cloneJsonValue(item));
634
+ }
635
+ if (typeof value === "object") {
636
+ return cloneJsonObject(value);
637
+ }
638
+ return value;
639
+ }
640
+ function logWarning3(message, details) {
641
+ if (details && details.length > 0) {
642
+ const detailBlock = details.join("\n");
643
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
644
+ ${detailBlock}${ANSI_RESET3}`);
645
+ } else {
646
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
590
647
  }
591
- return void 0;
592
648
  }
649
+
650
+ // src/evaluation/formatting/prompt-builder.ts
651
+ var import_promises4 = require("fs/promises");
652
+ var import_node_path5 = __toESM(require("path"), 1);
653
+ var ANSI_YELLOW4 = "\x1B[33m";
654
+ var ANSI_RESET4 = "\x1B[0m";
593
655
  async function buildPromptInputs(testCase) {
594
656
  const guidelineParts = [];
595
657
  for (const rawPath of testCase.guideline_paths) {
596
- const absolutePath = import_node_path2.default.resolve(rawPath);
597
- if (!await fileExists2(absolutePath)) {
598
- logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
658
+ const absolutePath = import_node_path5.default.resolve(rawPath);
659
+ if (!await fileExists(absolutePath)) {
660
+ logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
599
661
  continue;
600
662
  }
601
663
  try {
602
- const content = (await (0, import_promises2.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
664
+ const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
603
665
  guidelineParts.push({
604
666
  content,
605
667
  isFile: true,
606
- displayPath: import_node_path2.default.basename(absolutePath)
668
+ displayPath: import_node_path5.default.basename(absolutePath)
607
669
  });
608
670
  } catch (error) {
609
- logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
671
+ logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
610
672
  }
611
673
  }
612
674
  const guidelines = formatFileContents(guidelineParts);
@@ -630,9 +692,9 @@ async function buildPromptInputs(testCase) {
630
692
  messageSegments.push({ type: "text", value: segment });
631
693
  }
632
694
  } else if (isJsonObject(segment)) {
633
- const type = asString(segment.type);
695
+ const type = asString4(segment.type);
634
696
  if (type === "file") {
635
- const value = asString(segment.value);
697
+ const value = asString4(segment.value);
636
698
  if (!value) continue;
637
699
  if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
638
700
  messageSegments.push({ type: "guideline_ref", path: value });
@@ -643,7 +705,7 @@ async function buildPromptInputs(testCase) {
643
705
  messageSegments.push({ type: "file", text: fileText, path: value });
644
706
  }
645
707
  } else if (type === "text") {
646
- const textValue = asString(segment.value);
708
+ const textValue = asString4(segment.value);
647
709
  if (textValue && textValue.trim().length > 0) {
648
710
  messageSegments.push({ type: "text", value: textValue });
649
711
  }
@@ -699,6 +761,18 @@ ${messageContent}`);
699
761
  }) : void 0;
700
762
  return { question, guidelines, chatPrompt };
701
763
  }
764
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
765
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
766
+ return true;
767
+ }
768
+ let messagesWithContent = 0;
769
+ for (const segments of processedSegmentsByMessage) {
770
+ if (hasVisibleContent(segments)) {
771
+ messagesWithContent++;
772
+ }
773
+ }
774
+ return messagesWithContent > 1;
775
+ }
702
776
  function buildChatPromptFromSegments(options) {
703
777
  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
704
778
  if (messages.length === 0) {
@@ -740,13 +814,12 @@ ${guidelineContent.trim()}`);
740
814
  const segments = segmentsByMessage[i];
741
815
  const contentParts = [];
742
816
  let role = message.role;
743
- let name;
744
817
  if (role === "system") {
745
818
  role = "assistant";
746
819
  contentParts.push("@[System]:");
747
820
  } else if (role === "tool") {
748
- role = "function";
749
- name = "tool";
821
+ role = "assistant";
822
+ contentParts.push("@[Tool]:");
750
823
  }
751
824
  for (const segment of segments) {
752
825
  if (segment.type === "guideline_ref") {
@@ -764,282 +837,509 @@ ${guidelineContent.trim()}`);
764
837
  if (contentParts.length === 0) {
765
838
  continue;
766
839
  }
840
+ const content = contentParts.join("\n");
767
841
  chatPrompt.push({
768
842
  role,
769
- content: contentParts.join("\n"),
770
- ...name ? { name } : {}
843
+ content
771
844
  });
772
845
  }
773
846
  return chatPrompt.length > 0 ? chatPrompt : void 0;
774
847
  }
775
- async function fileExists2(absolutePath) {
848
+ function asString4(value) {
849
+ return typeof value === "string" ? value : void 0;
850
+ }
851
+ function logWarning4(message) {
852
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
853
+ }
854
+
855
+ // src/evaluation/yaml-parser.ts
856
+ var ANSI_YELLOW5 = "\x1B[33m";
857
+ var ANSI_RESET5 = "\x1B[0m";
858
+ var SCHEMA_EVAL_V2 = "agentv-eval-v2";
859
+ async function readTestSuiteMetadata(testFilePath) {
776
860
  try {
777
- await (0, import_promises2.access)(absolutePath, import_node_fs2.constants.F_OK);
778
- return true;
861
+ const absolutePath = import_node_path6.default.resolve(testFilePath);
862
+ const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
863
+ const parsed = (0, import_yaml2.parse)(content);
864
+ if (!isJsonObject(parsed)) {
865
+ return {};
866
+ }
867
+ return { target: extractTargetFromSuite(parsed) };
779
868
  } catch {
780
- return false;
869
+ return {};
781
870
  }
782
871
  }
783
- function resolveToAbsolutePath(candidate) {
784
- if (candidate instanceof URL) {
785
- return (0, import_node_url.fileURLToPath)(candidate);
872
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
873
+ const verbose = options?.verbose ?? false;
874
+ const evalIdFilter = options?.evalId;
875
+ const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
876
+ const repoRootPath = resolveToAbsolutePath(repoRoot);
877
+ const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
878
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
879
+ const guidelinePatterns = config?.guideline_patterns;
880
+ const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
881
+ const parsed = (0, import_yaml2.parse)(rawFile);
882
+ if (!isJsonObject(parsed)) {
883
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
786
884
  }
787
- if (typeof candidate === "string") {
788
- if (candidate.startsWith("file://")) {
789
- return (0, import_node_url.fileURLToPath)(new URL(candidate));
885
+ const suite = parsed;
886
+ const datasetNameFromSuite = asString5(suite.dataset)?.trim();
887
+ const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
888
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
889
+ const schema = suite.$schema;
890
+ if (schema !== SCHEMA_EVAL_V2) {
891
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
892
+ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
893
+ throw new Error(message);
894
+ }
895
+ const rawTestcases = suite.evalcases;
896
+ if (!Array.isArray(rawTestcases)) {
897
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
898
+ }
899
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
900
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
901
+ const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
902
+ const results = [];
903
+ for (const rawEvalcase of rawTestcases) {
904
+ if (!isJsonObject(rawEvalcase)) {
905
+ logWarning5("Skipping invalid eval case entry (expected object)");
906
+ continue;
790
907
  }
791
- return import_node_path2.default.resolve(candidate);
908
+ const evalcase = rawEvalcase;
909
+ const id = asString5(evalcase.id);
910
+ if (evalIdFilter && id !== evalIdFilter) {
911
+ continue;
912
+ }
913
+ const conversationId = asString5(evalcase.conversation_id);
914
+ const outcome = asString5(evalcase.outcome);
915
+ const inputMessagesValue = evalcase.input_messages;
916
+ const expectedMessagesValue = evalcase.expected_messages;
917
+ if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
918
+ logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
919
+ continue;
920
+ }
921
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
922
+ const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
923
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
924
+ if (hasExpectedMessages && expectedMessages.length === 0) {
925
+ logWarning5(`No valid expected message found for eval case: ${id}`);
926
+ continue;
927
+ }
928
+ if (expectedMessages.length > 1) {
929
+ logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
930
+ }
931
+ const guidelinePaths = [];
932
+ const inputTextParts = [];
933
+ const inputSegments = await processMessages({
934
+ messages: inputMessages,
935
+ searchRoots,
936
+ repoRootPath,
937
+ guidelinePatterns,
938
+ guidelinePaths,
939
+ textParts: inputTextParts,
940
+ messageType: "input",
941
+ verbose
942
+ });
943
+ const outputSegments = hasExpectedMessages ? await processMessages({
944
+ messages: expectedMessages,
945
+ searchRoots,
946
+ repoRootPath,
947
+ guidelinePatterns,
948
+ messageType: "output",
949
+ verbose
950
+ }) : [];
951
+ const codeSnippets = extractCodeBlocks(inputSegments);
952
+ const expectedContent = expectedMessages[0]?.content;
953
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
954
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
955
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
956
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
957
+ const userFilePaths = [];
958
+ for (const segment of inputSegments) {
959
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
960
+ userFilePaths.push(segment.resolvedPath);
961
+ }
962
+ }
963
+ const allFilePaths = [
964
+ ...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
965
+ ...userFilePaths
966
+ ];
967
+ const testCase = {
968
+ id,
969
+ dataset: datasetName,
970
+ conversation_id: conversationId,
971
+ question,
972
+ input_messages: inputMessages,
973
+ input_segments: inputSegments,
974
+ output_segments: outputSegments,
975
+ reference_answer: referenceAnswer,
976
+ guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
977
+ guideline_patterns: guidelinePatterns,
978
+ file_paths: allFilePaths,
979
+ code_snippets: codeSnippets,
980
+ expected_outcome: outcome,
981
+ evaluator: evalCaseEvaluatorKind,
982
+ evaluators
983
+ };
984
+ if (verbose) {
985
+ console.log(`
986
+ [Eval Case: ${id}]`);
987
+ if (testCase.guideline_paths.length > 0) {
988
+ console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
989
+ for (const guidelinePath of testCase.guideline_paths) {
990
+ console.log(` - ${guidelinePath}`);
991
+ }
992
+ } else {
993
+ console.log(" No guidelines found");
994
+ }
995
+ }
996
+ results.push(testCase);
792
997
  }
793
- throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
998
+ return results;
794
999
  }
795
- function asString(value) {
1000
+ function asString5(value) {
796
1001
  return typeof value === "string" ? value : void 0;
797
1002
  }
798
- function cloneJsonObject(source) {
799
- const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
800
- return Object.fromEntries(entries);
801
- }
802
- function cloneJsonValue(value) {
803
- if (value === null) {
804
- return null;
805
- }
806
- if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
807
- return value;
1003
+ function logWarning5(message, details) {
1004
+ if (details && details.length > 0) {
1005
+ const detailBlock = details.join("\n");
1006
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}
1007
+ ${detailBlock}${ANSI_RESET5}`);
1008
+ } else {
1009
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
808
1010
  }
809
- if (Array.isArray(value)) {
810
- return value.map((item) => cloneJsonValue(item));
1011
+ }
1012
+
1013
+ // src/evaluation/file-utils.ts
1014
+ var import_node_fs2 = require("fs");
1015
+ var import_promises6 = require("fs/promises");
1016
+ var import_node_path7 = __toESM(require("path"), 1);
1017
+ async function fileExists2(filePath) {
1018
+ try {
1019
+ await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
1020
+ return true;
1021
+ } catch {
1022
+ return false;
811
1023
  }
812
- return cloneJsonObject(value);
813
1024
  }
814
- function formatFileContents(parts) {
815
- const fileCount = parts.filter((p) => p.isFile).length;
816
- if (fileCount > 0) {
817
- return parts.map((part) => {
818
- if (part.isFile && part.displayPath) {
819
- return `<file path="${part.displayPath}">
820
- ${part.content}
821
- </file>`;
822
- }
823
- return part.content;
824
- }).join("\n\n");
1025
+ function normalizeLineEndings(content) {
1026
+ return content.replace(/\r\n/g, "\n");
1027
+ }
1028
+ async function readTextFile(filePath) {
1029
+ const content = await (0, import_promises6.readFile)(filePath, "utf8");
1030
+ return normalizeLineEndings(content);
1031
+ }
1032
+ async function findGitRoot(startPath) {
1033
+ let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
1034
+ const root = import_node_path7.default.parse(currentDir).root;
1035
+ while (currentDir !== root) {
1036
+ const gitPath = import_node_path7.default.join(currentDir, ".git");
1037
+ if (await fileExists2(gitPath)) {
1038
+ return currentDir;
1039
+ }
1040
+ const parentDir = import_node_path7.default.dirname(currentDir);
1041
+ if (parentDir === currentDir) {
1042
+ break;
1043
+ }
1044
+ currentDir = parentDir;
825
1045
  }
826
- return parts.map((p) => p.content).join(" ");
1046
+ return null;
827
1047
  }
828
- async function resolveAssistantContent(content, searchRoots, verbose) {
829
- if (typeof content === "string") {
830
- return content;
1048
+ function buildDirectoryChain2(filePath, repoRoot) {
1049
+ const directories = [];
1050
+ const seen = /* @__PURE__ */ new Set();
1051
+ const boundary = import_node_path7.default.resolve(repoRoot);
1052
+ let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
1053
+ while (current !== void 0) {
1054
+ if (!seen.has(current)) {
1055
+ directories.push(current);
1056
+ seen.add(current);
1057
+ }
1058
+ if (current === boundary) {
1059
+ break;
1060
+ }
1061
+ const parent = import_node_path7.default.dirname(current);
1062
+ if (parent === current) {
1063
+ break;
1064
+ }
1065
+ current = parent;
831
1066
  }
832
- if (!content) {
833
- return "";
1067
+ if (!seen.has(boundary)) {
1068
+ directories.push(boundary);
834
1069
  }
835
- const parts = [];
836
- for (const entry of content) {
837
- if (typeof entry === "string") {
838
- parts.push({ content: entry, isFile: false });
839
- continue;
840
- }
841
- if (!isJsonObject(entry)) {
842
- continue;
1070
+ return directories;
1071
+ }
1072
+ function buildSearchRoots2(evalPath, repoRoot) {
1073
+ const uniqueRoots = [];
1074
+ const addRoot = (root) => {
1075
+ const normalized = import_node_path7.default.resolve(root);
1076
+ if (!uniqueRoots.includes(normalized)) {
1077
+ uniqueRoots.push(normalized);
843
1078
  }
844
- const segmentType = asString(entry.type);
845
- if (segmentType === "file") {
846
- const rawValue = asString(entry.value);
847
- if (!rawValue) {
848
- continue;
849
- }
850
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
851
- rawValue,
852
- searchRoots
853
- );
854
- if (!resolvedPath) {
855
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
856
- logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
857
- continue;
858
- }
859
- try {
860
- const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
861
- parts.push({ content: fileContent, isFile: true, displayPath });
862
- if (verbose) {
863
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
864
- console.log(` Resolved to: ${resolvedPath}`);
865
- }
866
- } catch (error) {
867
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
868
- }
869
- continue;
1079
+ };
1080
+ let currentDir = import_node_path7.default.dirname(evalPath);
1081
+ let reachedBoundary = false;
1082
+ while (!reachedBoundary) {
1083
+ addRoot(currentDir);
1084
+ const parentDir = import_node_path7.default.dirname(currentDir);
1085
+ if (currentDir === repoRoot || parentDir === currentDir) {
1086
+ reachedBoundary = true;
1087
+ } else {
1088
+ currentDir = parentDir;
870
1089
  }
871
- const textValue = asString(entry.text);
872
- if (typeof textValue === "string") {
873
- parts.push({ content: textValue, isFile: false });
1090
+ }
1091
+ addRoot(repoRoot);
1092
+ addRoot(process.cwd());
1093
+ return uniqueRoots;
1094
+ }
1095
+ function trimLeadingSeparators2(value) {
1096
+ const trimmed = value.replace(/^[/\\]+/, "");
1097
+ return trimmed.length > 0 ? trimmed : value;
1098
+ }
1099
+ async function resolveFileReference2(rawValue, searchRoots) {
1100
+ const displayPath = trimLeadingSeparators2(rawValue);
1101
+ const potentialPaths = [];
1102
+ if (import_node_path7.default.isAbsolute(rawValue)) {
1103
+ potentialPaths.push(import_node_path7.default.normalize(rawValue));
1104
+ }
1105
+ for (const base of searchRoots) {
1106
+ potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
1107
+ }
1108
+ const attempted = [];
1109
+ const seen = /* @__PURE__ */ new Set();
1110
+ for (const candidate of potentialPaths) {
1111
+ const absoluteCandidate = import_node_path7.default.resolve(candidate);
1112
+ if (seen.has(absoluteCandidate)) {
874
1113
  continue;
875
1114
  }
876
- const valueValue = asString(entry.value);
877
- if (typeof valueValue === "string") {
878
- parts.push({ content: valueValue, isFile: false });
879
- continue;
1115
+ seen.add(absoluteCandidate);
1116
+ attempted.push(absoluteCandidate);
1117
+ if (await fileExists2(absoluteCandidate)) {
1118
+ return { displayPath, resolvedPath: absoluteCandidate, attempted };
880
1119
  }
881
- parts.push({ content: JSON.stringify(entry), isFile: false });
882
1120
  }
883
- return formatFileContents(parts);
1121
+ return { displayPath, attempted };
884
1122
  }
885
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
886
- const execution = rawEvalCase.execution;
887
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
888
- if (candidateEvaluators === void 0) {
889
- return void 0;
1123
+
1124
+ // src/evaluation/providers/ai-sdk.ts
1125
+ var import_anthropic = require("@ai-sdk/anthropic");
1126
+ var import_azure = require("@ai-sdk/azure");
1127
+ var import_google = require("@ai-sdk/google");
1128
+ var import_ai = require("ai");
1129
+ var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
1130
+ var AzureProvider = class {
1131
+ constructor(targetName, config) {
1132
+ this.config = config;
1133
+ this.id = `azure:${targetName}`;
1134
+ this.targetName = targetName;
1135
+ this.defaults = {
1136
+ temperature: config.temperature,
1137
+ maxOutputTokens: config.maxOutputTokens
1138
+ };
1139
+ this.retryConfig = config.retry;
1140
+ const azure = (0, import_azure.createAzure)(buildAzureOptions(config));
1141
+ this.model = azure(config.deploymentName);
890
1142
  }
891
- if (!Array.isArray(candidateEvaluators)) {
892
- logWarning(`Skipping evaluators for '${evalId}': expected array`);
893
- return void 0;
1143
+ id;
1144
+ kind = "azure";
1145
+ targetName;
1146
+ model;
1147
+ defaults;
1148
+ retryConfig;
1149
+ async invoke(request) {
1150
+ return invokeModel({
1151
+ model: this.model,
1152
+ request,
1153
+ defaults: this.defaults,
1154
+ retryConfig: this.retryConfig
1155
+ });
1156
+ }
1157
+ };
1158
+ var AnthropicProvider = class {
1159
+ constructor(targetName, config) {
1160
+ this.config = config;
1161
+ this.id = `anthropic:${targetName}`;
1162
+ this.targetName = targetName;
1163
+ this.defaults = {
1164
+ temperature: config.temperature,
1165
+ maxOutputTokens: config.maxOutputTokens,
1166
+ thinkingBudget: config.thinkingBudget
1167
+ };
1168
+ this.retryConfig = config.retry;
1169
+ const anthropic = (0, import_anthropic.createAnthropic)({
1170
+ apiKey: config.apiKey
1171
+ });
1172
+ this.model = anthropic(config.model);
1173
+ }
1174
+ id;
1175
+ kind = "anthropic";
1176
+ targetName;
1177
+ model;
1178
+ defaults;
1179
+ retryConfig;
1180
+ async invoke(request) {
1181
+ const providerOptions = buildAnthropicProviderOptions(this.defaults);
1182
+ return invokeModel({
1183
+ model: this.model,
1184
+ request,
1185
+ defaults: this.defaults,
1186
+ retryConfig: this.retryConfig,
1187
+ providerOptions
1188
+ });
894
1189
  }
895
- const evaluators = [];
896
- for (const rawEvaluator of candidateEvaluators) {
897
- if (!isJsonObject(rawEvaluator)) {
898
- logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
899
- continue;
900
- }
901
- const name = asString(rawEvaluator.name);
902
- const typeValue = rawEvaluator.type;
903
- if (!name || !isEvaluatorKind(typeValue)) {
904
- logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
905
- continue;
906
- }
907
- if (typeValue === "code") {
908
- const script = asString(rawEvaluator.script);
909
- if (!script) {
910
- logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
911
- continue;
912
- }
913
- const cwd = asString(rawEvaluator.cwd);
914
- let resolvedCwd;
915
- if (cwd) {
916
- const resolved = await resolveFileReference(cwd, searchRoots);
917
- if (resolved.resolvedPath) {
918
- resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
919
- } else {
920
- logWarning(
921
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
922
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
923
- );
924
- }
925
- } else {
926
- resolvedCwd = searchRoots[0];
927
- }
928
- evaluators.push({
929
- name,
930
- type: "code",
931
- script,
932
- cwd,
933
- resolvedCwd
934
- });
935
- continue;
936
- }
937
- const prompt = asString(rawEvaluator.prompt);
938
- let promptPath;
939
- if (prompt) {
940
- const resolved = await resolveFileReference(prompt, searchRoots);
941
- if (resolved.resolvedPath) {
942
- promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
943
- } else {
944
- logWarning(
945
- `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
946
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
947
- );
948
- }
949
- }
950
- const model = asString(rawEvaluator.model);
951
- evaluators.push({
952
- name,
953
- type: "llm_judge",
954
- prompt,
955
- promptPath
1190
+ };
1191
+ var GeminiProvider = class {
1192
+ constructor(targetName, config) {
1193
+ this.config = config;
1194
+ this.id = `gemini:${targetName}`;
1195
+ this.targetName = targetName;
1196
+ this.defaults = {
1197
+ temperature: config.temperature,
1198
+ maxOutputTokens: config.maxOutputTokens
1199
+ };
1200
+ this.retryConfig = config.retry;
1201
+ const google = (0, import_google.createGoogleGenerativeAI)({
1202
+ apiKey: config.apiKey
956
1203
  });
1204
+ this.model = google(config.model);
957
1205
  }
958
- return evaluators.length > 0 ? evaluators : void 0;
1206
+ id;
1207
+ kind = "gemini";
1208
+ targetName;
1209
+ model;
1210
+ defaults;
1211
+ retryConfig;
1212
+ async invoke(request) {
1213
+ return invokeModel({
1214
+ model: this.model,
1215
+ request,
1216
+ defaults: this.defaults,
1217
+ retryConfig: this.retryConfig
1218
+ });
1219
+ }
1220
+ };
1221
+ function buildAzureOptions(config) {
1222
+ const options = {
1223
+ apiKey: config.apiKey,
1224
+ apiVersion: config.version,
1225
+ useDeploymentBasedUrls: true
1226
+ };
1227
+ const baseURL = normalizeAzureBaseUrl(config.resourceName);
1228
+ if (baseURL) {
1229
+ options.baseURL = baseURL;
1230
+ } else {
1231
+ options.resourceName = config.resourceName;
1232
+ }
1233
+ return options;
959
1234
  }
960
- function coerceEvaluator(candidate, contextId) {
961
- if (typeof candidate !== "string") {
1235
+ function normalizeAzureBaseUrl(resourceName) {
1236
+ const trimmed = resourceName.trim();
1237
+ if (!/^https?:\/\//i.test(trimmed)) {
962
1238
  return void 0;
963
1239
  }
964
- if (isEvaluatorKind(candidate)) {
965
- return candidate;
966
- }
967
- logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
968
- return void 0;
1240
+ const withoutSlash = trimmed.replace(/\/+$/, "");
1241
+ const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
1242
+ return normalized;
969
1243
  }
970
- function logWarning(message, details) {
971
- if (details && details.length > 0) {
972
- const detailBlock = details.join("\n");
973
- console.warn(`${ANSI_YELLOW}Warning: ${message}
974
- ${detailBlock}${ANSI_RESET}`);
975
- } else {
976
- console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
1244
+ function buildAnthropicProviderOptions(defaults) {
1245
+ if (defaults.thinkingBudget === void 0) {
1246
+ return void 0;
977
1247
  }
1248
+ return {
1249
+ anthropic: {
1250
+ thinking: {
1251
+ type: "enabled",
1252
+ budgetTokens: defaults.thinkingBudget
1253
+ }
1254
+ }
1255
+ };
978
1256
  }
979
-
980
- // src/evaluation/providers/ax.ts
981
- var import_ax = require("@ax-llm/ax");
982
- var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
983
1257
  function buildChatPrompt(request) {
984
- if (request.chatPrompt) {
985
- const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
1258
+ const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
1259
+ if (provided) {
1260
+ const hasSystemMessage = provided.some((message) => message.role === "system");
986
1261
  if (hasSystemMessage) {
987
- return request.chatPrompt;
1262
+ return provided;
988
1263
  }
989
- const systemContent2 = resolveSystemContent(request);
990
- return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
1264
+ const systemContent2 = resolveSystemContent(request, false);
1265
+ return [{ role: "system", content: systemContent2 }, ...provided];
991
1266
  }
992
- const systemContent = resolveSystemContent(request);
1267
+ const systemContent = resolveSystemContent(request, true);
993
1268
  const userContent = request.question.trim();
994
1269
  const prompt = [
995
- {
996
- role: "system",
997
- content: systemContent
998
- },
999
- {
1000
- role: "user",
1001
- content: userContent
1002
- }
1270
+ { role: "system", content: systemContent },
1271
+ { role: "user", content: userContent }
1003
1272
  ];
1004
1273
  return prompt;
1005
1274
  }
1006
- function resolveSystemContent(request) {
1275
+ function resolveSystemContent(request, includeGuidelines) {
1007
1276
  const systemSegments = [];
1008
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
1009
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
1010
- systemSegments.push(metadataSystemPrompt.trim());
1277
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
1278
+ systemSegments.push(request.systemPrompt.trim());
1011
1279
  } else {
1012
1280
  systemSegments.push(DEFAULT_SYSTEM_PROMPT);
1013
1281
  }
1014
- if (request.guidelines && request.guidelines.trim().length > 0) {
1282
+ if (includeGuidelines && request.guidelines && request.guidelines.trim().length > 0) {
1015
1283
  systemSegments.push(`[[ ## Guidelines ## ]]
1016
1284
 
1017
1285
  ${request.guidelines.trim()}`);
1018
1286
  }
1019
1287
  return systemSegments.join("\n\n");
1020
1288
  }
1021
- function extractModelConfig(request, defaults) {
1289
+ function toModelMessages(chatPrompt) {
1290
+ return chatPrompt.map((message) => {
1291
+ if (message.role === "tool" || message.role === "function") {
1292
+ const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
1293
+ return {
1294
+ role: "assistant",
1295
+ content: `${prefix}${message.content}`
1296
+ };
1297
+ }
1298
+ if (message.role === "assistant" || message.role === "system" || message.role === "user") {
1299
+ return {
1300
+ role: message.role,
1301
+ content: message.content
1302
+ };
1303
+ }
1304
+ return {
1305
+ role: "user",
1306
+ content: message.content
1307
+ };
1308
+ });
1309
+ }
1310
+ function resolveModelSettings(request, defaults) {
1022
1311
  const temperature = request.temperature ?? defaults.temperature;
1023
- const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
1024
- const config = {};
1025
- if (temperature !== void 0) {
1026
- config.temperature = temperature;
1027
- }
1028
- if (maxTokens !== void 0) {
1029
- config.maxTokens = maxTokens;
1030
- }
1031
- return Object.keys(config).length > 0 ? config : void 0;
1312
+ const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
1313
+ return {
1314
+ temperature,
1315
+ maxOutputTokens
1316
+ };
1317
+ }
1318
+ async function invokeModel(options) {
1319
+ const { model, request, defaults, retryConfig, providerOptions } = options;
1320
+ const chatPrompt = buildChatPrompt(request);
1321
+ const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
1322
+ const result = await withRetry(
1323
+ () => (0, import_ai.generateText)({
1324
+ model,
1325
+ messages: toModelMessages(chatPrompt),
1326
+ temperature,
1327
+ maxOutputTokens,
1328
+ maxRetries: 0,
1329
+ abortSignal: request.signal,
1330
+ ...providerOptions ? { providerOptions } : {}
1331
+ }),
1332
+ retryConfig,
1333
+ request.signal
1334
+ );
1335
+ return mapResponse(result);
1032
1336
  }
1033
- function mapResponse(response) {
1034
- const primary = response.results[0];
1035
- const text = typeof primary?.content === "string" ? primary.content : "";
1036
- const reasoning = primary?.thought ?? primary?.thoughtBlock?.data;
1037
- const usage = toJsonObject(response.modelUsage);
1337
+ function mapResponse(result) {
1038
1338
  return {
1039
- text,
1040
- reasoning,
1041
- raw: response,
1042
- usage
1339
+ text: result.text ?? "",
1340
+ reasoning: result.reasoningText ?? void 0,
1341
+ raw: result,
1342
+ usage: toJsonObject(result.totalUsage ?? result.usage)
1043
1343
  };
1044
1344
  }
1045
1345
  function toJsonObject(value) {
@@ -1052,34 +1352,59 @@ function toJsonObject(value) {
1052
1352
  return void 0;
1053
1353
  }
1054
1354
  }
1055
- function ensureChatResponse(result) {
1056
- if (typeof ReadableStream !== "undefined" && result instanceof ReadableStream) {
1057
- throw new Error("Streaming responses are not supported for this provider");
1355
+ function extractStatus(error) {
1356
+ if (!error || typeof error !== "object") {
1357
+ return void 0;
1358
+ }
1359
+ const candidate = error;
1360
+ const directStatus = candidate.status ?? candidate.statusCode;
1361
+ if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
1362
+ return directStatus;
1058
1363
  }
1059
- if (!result || typeof result !== "object" || !("results" in result)) {
1060
- throw new Error("Unexpected response type from AxAI provider");
1364
+ const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
1365
+ if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
1366
+ return responseStatus;
1367
+ }
1368
+ const message = typeof candidate.message === "string" ? candidate.message : void 0;
1369
+ if (message) {
1370
+ const match = message.match(/HTTP\s+(\d{3})/i);
1371
+ if (match) {
1372
+ const parsed = Number.parseInt(match[1], 10);
1373
+ if (Number.isFinite(parsed)) {
1374
+ return parsed;
1375
+ }
1376
+ }
1061
1377
  }
1062
- return result;
1378
+ return void 0;
1063
1379
  }
1064
- function isRetryableError(error, retryableStatusCodes) {
1380
+ function isNetworkError(error) {
1065
1381
  if (!error || typeof error !== "object") {
1066
1382
  return false;
1067
1383
  }
1068
- if ("status" in error && typeof error.status === "number") {
1069
- return retryableStatusCodes.includes(error.status);
1384
+ const candidate = error;
1385
+ if (candidate.name === "AbortError") {
1386
+ return false;
1070
1387
  }
1071
- if ("message" in error && typeof error.message === "string") {
1072
- const match = error.message.match(/HTTP (\d{3})/);
1073
- if (match) {
1074
- const status = Number.parseInt(match[1], 10);
1075
- return retryableStatusCodes.includes(status);
1076
- }
1388
+ const code = candidate.code;
1389
+ if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
1390
+ return true;
1077
1391
  }
1078
- if ("name" in error && error.name === "AxAIServiceNetworkError") {
1392
+ const message = typeof candidate.message === "string" ? candidate.message : void 0;
1393
+ if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
1079
1394
  return true;
1080
1395
  }
1081
1396
  return false;
1082
1397
  }
1398
+ function isRetryableError(error, retryableStatusCodes) {
1399
+ const status = extractStatus(error);
1400
+ if (status === 401 || status === 403) {
1401
+ return false;
1402
+ }
1403
+ if (typeof status === "number") {
1404
+ return retryableStatusCodes.includes(status);
1405
+ }
1406
+ return isNetworkError(error);
1407
+ }
1083
1408
  function calculateRetryDelay(attempt, config) {
1084
1409
  const delay = Math.min(
1085
1410
  config.maxDelayMs,
@@ -1115,152 +1440,16 @@ async function withRetry(fn, retryConfig, signal) {
1115
1440
  }
1116
1441
  const delay = calculateRetryDelay(attempt, config);
1117
1442
  await sleep(delay);
1118
- if (signal?.aborted) {
1119
- throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
1120
- }
1121
1443
  }
1122
1444
  }
1123
1445
  throw lastError;
1124
1446
  }
1125
- var AzureProvider = class {
1126
- constructor(targetName, config) {
1127
- this.config = config;
1128
- this.id = `azure:${targetName}`;
1129
- this.targetName = targetName;
1130
- this.defaults = {
1131
- temperature: config.temperature,
1132
- maxOutputTokens: config.maxOutputTokens
1133
- };
1134
- this.retryConfig = config.retry;
1135
- this.ai = import_ax.AxAI.create({
1136
- name: "azure-openai",
1137
- apiKey: config.apiKey,
1138
- resourceName: config.resourceName,
1139
- deploymentName: config.deploymentName,
1140
- version: config.version,
1141
- config: {
1142
- stream: false
1143
- }
1144
- });
1145
- }
1146
- id;
1147
- kind = "azure";
1148
- targetName;
1149
- ai;
1150
- defaults;
1151
- retryConfig;
1152
- async invoke(request) {
1153
- const chatPrompt = buildChatPrompt(request);
1154
- const modelConfig = extractModelConfig(request, this.defaults);
1155
- const response = await withRetry(
1156
- async () => await this.ai.chat(
1157
- {
1158
- chatPrompt,
1159
- model: this.config.deploymentName,
1160
- ...modelConfig ? { modelConfig } : {}
1161
- },
1162
- request.signal ? { abortSignal: request.signal } : void 0
1163
- ),
1164
- this.retryConfig,
1165
- request.signal
1166
- );
1167
- return mapResponse(ensureChatResponse(response));
1168
- }
1169
- getAxAI() {
1170
- return this.ai;
1171
- }
1172
- };
1173
- var AnthropicProvider = class {
1174
- constructor(targetName, config) {
1175
- this.config = config;
1176
- this.id = `anthropic:${targetName}`;
1177
- this.targetName = targetName;
1178
- this.defaults = {
1179
- temperature: config.temperature,
1180
- maxOutputTokens: config.maxOutputTokens,
1181
- thinkingBudget: config.thinkingBudget
1182
- };
1183
- this.retryConfig = config.retry;
1184
- this.ai = import_ax.AxAI.create({
1185
- name: "anthropic",
1186
- apiKey: config.apiKey
1187
- });
1188
- }
1189
- id;
1190
- kind = "anthropic";
1191
- targetName;
1192
- ai;
1193
- defaults;
1194
- retryConfig;
1195
- async invoke(request) {
1196
- const chatPrompt = buildChatPrompt(request);
1197
- const modelConfig = extractModelConfig(request, this.defaults);
1198
- const response = await withRetry(
1199
- async () => await this.ai.chat(
1200
- {
1201
- chatPrompt,
1202
- model: this.config.model,
1203
- ...modelConfig ? { modelConfig } : {}
1204
- },
1205
- request.signal ? { abortSignal: request.signal } : void 0
1206
- ),
1207
- this.retryConfig,
1208
- request.signal
1209
- );
1210
- return mapResponse(ensureChatResponse(response));
1211
- }
1212
- getAxAI() {
1213
- return this.ai;
1214
- }
1215
- };
1216
- var GeminiProvider = class {
1217
- constructor(targetName, config) {
1218
- this.config = config;
1219
- this.id = `gemini:${targetName}`;
1220
- this.targetName = targetName;
1221
- this.defaults = {
1222
- temperature: config.temperature,
1223
- maxOutputTokens: config.maxOutputTokens
1224
- };
1225
- this.retryConfig = config.retry;
1226
- this.ai = import_ax.AxAI.create({
1227
- name: "google-gemini",
1228
- apiKey: config.apiKey
1229
- });
1230
- }
1231
- id;
1232
- kind = "gemini";
1233
- targetName;
1234
- ai;
1235
- defaults;
1236
- retryConfig;
1237
- async invoke(request) {
1238
- const chatPrompt = buildChatPrompt(request);
1239
- const modelConfig = extractModelConfig(request, this.defaults);
1240
- const response = await withRetry(
1241
- async () => await this.ai.chat(
1242
- {
1243
- chatPrompt,
1244
- model: this.config.model,
1245
- ...modelConfig ? { modelConfig } : {}
1246
- },
1247
- request.signal ? { abortSignal: request.signal } : void 0
1248
- ),
1249
- this.retryConfig,
1250
- request.signal
1251
- );
1252
- return mapResponse(ensureChatResponse(response));
1253
- }
1254
- getAxAI() {
1255
- return this.ai;
1256
- }
1257
- };
1258
1447
 
1259
1448
  // src/evaluation/providers/cli.ts
1260
1449
  var import_node_child_process = require("child_process");
1261
- var import_promises3 = __toESM(require("fs/promises"), 1);
1450
+ var import_promises7 = __toESM(require("fs/promises"), 1);
1262
1451
  var import_node_os = __toESM(require("os"), 1);
1263
- var import_node_path3 = __toESM(require("path"), 1);
1452
+ var import_node_path8 = __toESM(require("path"), 1);
1264
1453
  var import_node_util = require("util");
1265
1454
  var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
1266
1455
  var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
@@ -1302,12 +1491,14 @@ var CliProvider = class {
1302
1491
  supportsBatch = false;
1303
1492
  config;
1304
1493
  runCommand;
1494
+ verbose;
1305
1495
  healthcheckPromise;
1306
1496
  constructor(targetName, config, runner = defaultCommandRunner) {
1307
1497
  this.targetName = targetName;
1308
1498
  this.id = `cli:${targetName}`;
1309
1499
  this.config = config;
1310
1500
  this.runCommand = runner;
1501
+ this.verbose = config.verbose ?? false;
1311
1502
  }
1312
1503
  async invoke(request) {
1313
1504
  if (request.signal?.aborted) {
@@ -1357,7 +1548,7 @@ var CliProvider = class {
1357
1548
  const errorMsg = error instanceof Error ? error.message : String(error);
1358
1549
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1359
1550
  } finally {
1360
- await import_promises3.default.unlink(filePath).catch(() => {
1551
+ await import_promises7.default.unlink(filePath).catch(() => {
1361
1552
  });
1362
1553
  }
1363
1554
  }
@@ -1408,6 +1599,11 @@ var CliProvider = class {
1408
1599
  generateOutputFilePath("healthcheck")
1409
1600
  )
1410
1601
  );
1602
+ if (this.verbose) {
1603
+ console.log(
1604
+ `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
1605
+ );
1606
+ }
1411
1607
  const result = await this.runCommand(renderedCommand, {
1412
1608
  cwd: healthcheck.cwd ?? this.config.cwd,
1413
1609
  env: process.env,
@@ -1439,7 +1635,7 @@ function normalizeInputFiles(inputFiles) {
1439
1635
  }
1440
1636
  const unique = /* @__PURE__ */ new Map();
1441
1637
  for (const inputFile of inputFiles) {
1442
- const absolutePath = import_node_path3.default.resolve(inputFile);
1638
+ const absolutePath = import_node_path8.default.resolve(inputFile);
1443
1639
  if (!unique.has(absolutePath)) {
1444
1640
  unique.set(absolutePath, absolutePath);
1445
1641
  }
@@ -1453,7 +1649,7 @@ function formatFileList(files, template) {
1453
1649
  const formatter = template ?? "{path}";
1454
1650
  return files.map((filePath) => {
1455
1651
  const escapedPath = shellEscape(filePath);
1456
- const escapedName = shellEscape(import_node_path3.default.basename(filePath));
1652
+ const escapedName = shellEscape(import_node_path8.default.basename(filePath));
1457
1653
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1458
1654
  }).join(" ");
1459
1655
  }
@@ -1477,7 +1673,7 @@ function generateOutputFilePath(evalCaseId) {
1477
1673
  const safeEvalId = evalCaseId || "unknown";
1478
1674
  const timestamp = Date.now();
1479
1675
  const random = Math.random().toString(36).substring(2, 9);
1480
- return import_node_path3.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1676
+ return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1481
1677
  }
1482
1678
  function formatTimeoutSuffix(timeoutMs) {
1483
1679
  if (!timeoutMs || timeoutMs <= 0) {
@@ -1491,9 +1687,9 @@ function formatTimeoutSuffix(timeoutMs) {
1491
1687
  var import_node_child_process2 = require("child_process");
1492
1688
  var import_node_crypto = require("crypto");
1493
1689
  var import_node_fs3 = require("fs");
1494
- var import_promises4 = require("fs/promises");
1690
+ var import_promises8 = require("fs/promises");
1495
1691
  var import_node_os2 = require("os");
1496
- var import_node_path5 = __toESM(require("path"), 1);
1692
+ var import_node_path10 = __toESM(require("path"), 1);
1497
1693
  var import_node_util2 = require("util");
1498
1694
 
1499
1695
  // src/evaluation/providers/codex-log-tracker.ts
@@ -1550,7 +1746,7 @@ function subscribeToCodexLogEntries(listener) {
1550
1746
  }
1551
1747
 
1552
1748
  // src/evaluation/providers/preread.ts
1553
- var import_node_path4 = __toESM(require("path"), 1);
1749
+ var import_node_path9 = __toESM(require("path"), 1);
1554
1750
  function buildPromptDocument(request, inputFiles, options) {
1555
1751
  const parts = [];
1556
1752
  const guidelineFiles = collectGuidelineFiles(
@@ -1575,7 +1771,7 @@ function normalizeInputFiles2(inputFiles) {
1575
1771
  }
1576
1772
  const deduped = /* @__PURE__ */ new Map();
1577
1773
  for (const inputFile of inputFiles) {
1578
- const absolutePath = import_node_path4.default.resolve(inputFile);
1774
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1579
1775
  if (!deduped.has(absolutePath)) {
1580
1776
  deduped.set(absolutePath, absolutePath);
1581
1777
  }
@@ -1588,14 +1784,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
1588
1784
  }
1589
1785
  const unique = /* @__PURE__ */ new Map();
1590
1786
  for (const inputFile of inputFiles) {
1591
- const absolutePath = import_node_path4.default.resolve(inputFile);
1787
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1592
1788
  if (overrides?.has(absolutePath)) {
1593
1789
  if (!unique.has(absolutePath)) {
1594
1790
  unique.set(absolutePath, absolutePath);
1595
1791
  }
1596
1792
  continue;
1597
1793
  }
1598
- const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
1794
+ const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
1599
1795
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1600
1796
  if (!unique.has(absolutePath)) {
1601
1797
  unique.set(absolutePath, absolutePath);
@@ -1610,7 +1806,7 @@ function collectInputFiles(inputFiles) {
1610
1806
  }
1611
1807
  const unique = /* @__PURE__ */ new Map();
1612
1808
  for (const inputFile of inputFiles) {
1613
- const absolutePath = import_node_path4.default.resolve(inputFile);
1809
+ const absolutePath = import_node_path9.default.resolve(inputFile);
1614
1810
  if (!unique.has(absolutePath)) {
1615
1811
  unique.set(absolutePath, absolutePath);
1616
1812
  }
@@ -1622,7 +1818,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
1622
1818
  return "";
1623
1819
  }
1624
1820
  const buildList = (files) => files.map((absolutePath) => {
1625
- const fileName = import_node_path4.default.basename(absolutePath);
1821
+ const fileName = import_node_path9.default.basename(absolutePath);
1626
1822
  const fileUri = pathToFileUri(absolutePath);
1627
1823
  return `* [${fileName}](${fileUri})`;
1628
1824
  });
@@ -1642,7 +1838,7 @@ ${buildList(inputFiles).join("\n")}.`);
1642
1838
  return sections.join("\n");
1643
1839
  }
1644
1840
  function pathToFileUri(filePath) {
1645
- const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
1841
+ const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
1646
1842
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1647
1843
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1648
1844
  return `file:///${normalizedPath}`;
@@ -1680,8 +1876,8 @@ var CodexProvider = class {
1680
1876
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1681
1877
  try {
1682
1878
  const promptContent = buildPromptDocument(request, inputFiles);
1683
- const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
1684
- await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
1879
+ const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
1880
+ await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
1685
1881
  const args = this.buildCodexArgs();
1686
1882
  const cwd = this.resolveCwd(workspaceRoot);
1687
1883
  const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -1730,7 +1926,7 @@ var CodexProvider = class {
1730
1926
  if (!this.config.cwd) {
1731
1927
  return workspaceRoot;
1732
1928
  }
1733
- return import_node_path5.default.resolve(this.config.cwd);
1929
+ return import_node_path10.default.resolve(this.config.cwd);
1734
1930
  }
1735
1931
  buildCodexArgs() {
1736
1932
  const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
@@ -1764,11 +1960,11 @@ var CodexProvider = class {
1764
1960
  }
1765
1961
  }
1766
1962
  async createWorkspace() {
1767
- return await (0, import_promises4.mkdtemp)(import_node_path5.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1963
+ return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
1768
1964
  }
1769
1965
  async cleanupWorkspace(workspaceRoot) {
1770
1966
  try {
1771
- await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
1967
+ await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
1772
1968
  } catch {
1773
1969
  }
1774
1970
  }
@@ -1778,9 +1974,9 @@ var CodexProvider = class {
1778
1974
  return void 0;
1779
1975
  }
1780
1976
  if (this.config.logDir) {
1781
- return import_node_path5.default.resolve(this.config.logDir);
1977
+ return import_node_path10.default.resolve(this.config.logDir);
1782
1978
  }
1783
- return import_node_path5.default.join(process.cwd(), ".agentv", "logs", "codex");
1979
+ return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
1784
1980
  }
1785
1981
  async createStreamLogger(request) {
1786
1982
  const logDir = this.resolveLogDirectory();
@@ -1788,13 +1984,13 @@ var CodexProvider = class {
1788
1984
  return void 0;
1789
1985
  }
1790
1986
  try {
1791
- await (0, import_promises4.mkdir)(logDir, { recursive: true });
1987
+ await (0, import_promises8.mkdir)(logDir, { recursive: true });
1792
1988
  } catch (error) {
1793
1989
  const message = error instanceof Error ? error.message : String(error);
1794
1990
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
1795
1991
  return void 0;
1796
1992
  }
1797
- const filePath = import_node_path5.default.join(logDir, buildLogFilename(request, this.targetName));
1993
+ const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
1798
1994
  try {
1799
1995
  const logger = await CodexStreamLogger.create({
1800
1996
  filePath,
@@ -2009,9 +2205,9 @@ function tryParseJsonValue(rawLine) {
2009
2205
  async function locateExecutable(candidate) {
2010
2206
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
2011
2207
  if (includesPathSeparator) {
2012
- const resolved = import_node_path5.default.isAbsolute(candidate) ? candidate : import_node_path5.default.resolve(candidate);
2208
+ const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
2013
2209
  const executablePath = await ensureWindowsExecutableVariant(resolved);
2014
- await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
2210
+ await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
2015
2211
  return executablePath;
2016
2212
  }
2017
2213
  const locator = process.platform === "win32" ? "where" : "which";
@@ -2021,7 +2217,7 @@ async function locateExecutable(candidate) {
2021
2217
  const preferred = selectExecutableCandidate(lines);
2022
2218
  if (preferred) {
2023
2219
  const executablePath = await ensureWindowsExecutableVariant(preferred);
2024
- await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
2220
+ await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
2025
2221
  return executablePath;
2026
2222
  }
2027
2223
  } catch {
@@ -2055,7 +2251,7 @@ async function ensureWindowsExecutableVariant(candidate) {
2055
2251
  for (const ext of extensions) {
2056
2252
  const withExtension = `${candidate}${ext}`;
2057
2253
  try {
2058
- await (0, import_promises4.access)(withExtension, import_node_fs3.constants.F_OK);
2254
+ await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
2059
2255
  return withExtension;
2060
2256
  } catch {
2061
2257
  }
@@ -2867,7 +3063,7 @@ function resolveOptionalNumberArray(source, description) {
2867
3063
  }
2868
3064
 
2869
3065
  // src/evaluation/providers/vscode.ts
2870
- var import_node_path6 = __toESM(require("path"), 1);
3066
+ var import_node_path11 = __toESM(require("path"), 1);
2871
3067
  var import_subagent = require("subagent");
2872
3068
  var VSCodeProvider = class {
2873
3069
  id;
@@ -2980,6 +3176,9 @@ var VSCodeProvider = class {
2980
3176
  };
2981
3177
  function buildPromptDocument2(request, attachments, guidelinePatterns) {
2982
3178
  const parts = [];
3179
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
3180
+ parts.push(request.systemPrompt.trim());
3181
+ }
2983
3182
  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
2984
3183
  const attachmentFiles = collectAttachmentFiles(attachments);
2985
3184
  const nonGuidelineAttachments = attachmentFiles.filter(
@@ -2997,7 +3196,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
2997
3196
  return "";
2998
3197
  }
2999
3198
  const buildList = (files) => files.map((absolutePath) => {
3000
- const fileName = import_node_path6.default.basename(absolutePath);
3199
+ const fileName = import_node_path11.default.basename(absolutePath);
3001
3200
  const fileUri = pathToFileUri2(absolutePath);
3002
3201
  return `* [${fileName}](${fileUri})`;
3003
3202
  });
@@ -3022,8 +3221,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3022
3221
  }
3023
3222
  const unique = /* @__PURE__ */ new Map();
3024
3223
  for (const attachment of attachments) {
3025
- const absolutePath = import_node_path6.default.resolve(attachment);
3026
- const normalized = absolutePath.split(import_node_path6.default.sep).join("/");
3224
+ const absolutePath = import_node_path11.default.resolve(attachment);
3225
+ const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
3027
3226
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3028
3227
  if (!unique.has(absolutePath)) {
3029
3228
  unique.set(absolutePath, absolutePath);
@@ -3038,7 +3237,7 @@ function collectAttachmentFiles(attachments) {
3038
3237
  }
3039
3238
  const unique = /* @__PURE__ */ new Map();
3040
3239
  for (const attachment of attachments) {
3041
- const absolutePath = import_node_path6.default.resolve(attachment);
3240
+ const absolutePath = import_node_path11.default.resolve(attachment);
3042
3241
  if (!unique.has(absolutePath)) {
3043
3242
  unique.set(absolutePath, absolutePath);
3044
3243
  }
@@ -3046,7 +3245,7 @@ function collectAttachmentFiles(attachments) {
3046
3245
  return Array.from(unique.values());
3047
3246
  }
3048
3247
  function pathToFileUri2(filePath) {
3049
- const absolutePath = import_node_path6.default.isAbsolute(filePath) ? filePath : import_node_path6.default.resolve(filePath);
3248
+ const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
3050
3249
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3051
3250
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3052
3251
  return `file:///${normalizedPath}`;
@@ -3059,7 +3258,7 @@ function normalizeAttachments(attachments) {
3059
3258
  }
3060
3259
  const deduped = /* @__PURE__ */ new Set();
3061
3260
  for (const attachment of attachments) {
3062
- deduped.add(import_node_path6.default.resolve(attachment));
3261
+ deduped.add(import_node_path11.default.resolve(attachment));
3063
3262
  }
3064
3263
  return Array.from(deduped);
3065
3264
  }
@@ -3068,7 +3267,7 @@ function mergeAttachments(all) {
3068
3267
  for (const list of all) {
3069
3268
  if (!list) continue;
3070
3269
  for (const inputFile of list) {
3071
- deduped.add(import_node_path6.default.resolve(inputFile));
3270
+ deduped.add(import_node_path11.default.resolve(inputFile));
3072
3271
  }
3073
3272
  }
3074
3273
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3114,9 +3313,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3114
3313
 
3115
3314
  // src/evaluation/providers/targets-file.ts
3116
3315
  var import_node_fs4 = require("fs");
3117
- var import_promises5 = require("fs/promises");
3118
- var import_node_path7 = __toESM(require("path"), 1);
3119
- var import_yaml2 = require("yaml");
3316
+ var import_promises9 = require("fs/promises");
3317
+ var import_node_path12 = __toESM(require("path"), 1);
3318
+ var import_yaml3 = require("yaml");
3120
3319
 
3121
3320
  // src/evaluation/providers/types.ts
3122
3321
  var AGENT_PROVIDER_KINDS = [
@@ -3177,19 +3376,19 @@ function assertTargetDefinition(value, index, filePath) {
3177
3376
  }
3178
3377
  async function fileExists3(filePath) {
3179
3378
  try {
3180
- await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
3379
+ await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
3181
3380
  return true;
3182
3381
  } catch {
3183
3382
  return false;
3184
3383
  }
3185
3384
  }
3186
3385
  async function readTargetDefinitions(filePath) {
3187
- const absolutePath = import_node_path7.default.resolve(filePath);
3386
+ const absolutePath = import_node_path12.default.resolve(filePath);
3188
3387
  if (!await fileExists3(absolutePath)) {
3189
3388
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3190
3389
  }
3191
- const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
3192
- const parsed = (0, import_yaml2.parse)(raw);
3390
+ const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
3391
+ const parsed = (0, import_yaml3.parse)(raw);
3193
3392
  if (!isRecord(parsed)) {
3194
3393
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
3195
3394
  }
@@ -3232,18 +3431,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
3232
3431
  }
3233
3432
 
3234
3433
  // src/evaluation/evaluators.ts
3235
- var import_node_crypto2 = require("crypto");
3434
+ var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3435
+
3436
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3437
+
3438
+ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3439
+
3440
+ [[ ## expected_outcome ## ]]
3441
+ {{expected_outcome}}
3442
+
3443
+ [[ ## question ## ]]
3444
+ {{question}}
3445
+
3446
+ [[ ## reference_answer ## ]]
3447
+ {{reference_answer}}
3448
+
3449
+ [[ ## candidate_answer ## ]]
3450
+ {{candidate_answer}}`;
3236
3451
  var LlmJudgeEvaluator = class {
3237
3452
  kind = "llm_judge";
3238
3453
  resolveJudgeProvider;
3239
3454
  maxOutputTokens;
3240
3455
  temperature;
3241
- customPrompt;
3456
+ evaluatorTemplate;
3242
3457
  constructor(options) {
3243
3458
  this.resolveJudgeProvider = options.resolveJudgeProvider;
3244
3459
  this.maxOutputTokens = options.maxOutputTokens;
3245
3460
  this.temperature = options.temperature;
3246
- this.customPrompt = options.customPrompt;
3461
+ this.evaluatorTemplate = options.evaluatorTemplate;
3247
3462
  }
3248
3463
  async evaluate(context) {
3249
3464
  const judgeProvider = await this.resolveJudgeProvider(context);
@@ -3253,26 +3468,21 @@ var LlmJudgeEvaluator = class {
3253
3468
  return this.evaluateWithPrompt(context, judgeProvider);
3254
3469
  }
3255
3470
  async evaluateWithPrompt(context, judgeProvider) {
3256
- const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
3257
3471
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
3258
- let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
3259
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
3260
- if (systemPrompt && hasTemplateVariables(systemPrompt)) {
3261
- const variables = {
3262
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3263
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3264
- candidate_answer: context.candidate,
3265
- reference_answer: context.evalCase.reference_answer ?? "",
3266
- expected_outcome: context.evalCase.expected_outcome,
3267
- question: formattedQuestion
3268
- };
3269
- prompt = substituteVariables(systemPrompt, variables);
3270
- systemPrompt = buildSystemPrompt(hasReferenceAnswer);
3271
- }
3272
- const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
3472
+ const variables = {
3473
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
3474
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
3475
+ candidate_answer: context.candidate.trim(),
3476
+ reference_answer: (context.evalCase.reference_answer ?? "").trim(),
3477
+ expected_outcome: context.evalCase.expected_outcome.trim(),
3478
+ question: formattedQuestion.trim()
3479
+ };
3480
+ const systemPrompt = buildOutputSchema();
3481
+ const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
3482
+ const userPrompt = substituteVariables(evaluatorTemplate, variables);
3273
3483
  const response = await judgeProvider.invoke({
3274
- question: prompt,
3275
- metadata,
3484
+ question: userPrompt,
3485
+ systemPrompt,
3276
3486
  evalCaseId: context.evalCase.id,
3277
3487
  attempt: context.attempt,
3278
3488
  maxOutputTokens: this.maxOutputTokens,
@@ -3285,11 +3495,9 @@ var LlmJudgeEvaluator = class {
3285
3495
  const reasoning = parsed.reasoning ?? response.reasoning;
3286
3496
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3287
3497
  const evaluatorRawRequest = {
3288
- id: (0, import_node_crypto2.randomUUID)(),
3289
- provider: judgeProvider.id,
3290
- prompt,
3291
- target: context.target.name,
3292
- ...systemPrompt !== void 0 && { systemPrompt }
3498
+ userPrompt,
3499
+ systemPrompt,
3500
+ target: judgeProvider.targetName
3293
3501
  };
3294
3502
  return {
3295
3503
  score,
@@ -3301,20 +3509,8 @@ var LlmJudgeEvaluator = class {
3301
3509
  };
3302
3510
  }
3303
3511
  };
3304
- function buildSystemPrompt(hasReferenceAnswer) {
3305
- const basePrompt = [
3306
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
3307
- ""
3308
- ];
3309
- if (hasReferenceAnswer) {
3310
- basePrompt.push(
3311
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
3312
- ""
3313
- );
3314
- }
3315
- basePrompt.push(
3316
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
3317
- "",
3512
+ function buildOutputSchema() {
3513
+ return [
3318
3514
  "You must respond with a single JSON object matching this schema:",
3319
3515
  "",
3320
3516
  "{",
@@ -3323,30 +3519,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
3323
3519
  ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
3324
3520
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
3325
3521
  "}"
3326
- );
3327
- return basePrompt.join("\n");
3328
- }
3329
- function buildQualityPrompt(evalCase, candidate, question) {
3330
- const parts = [
3331
- "[[ ## expected_outcome ## ]]",
3332
- evalCase.expected_outcome.trim(),
3333
- "",
3334
- "[[ ## question ## ]]",
3335
- question.trim(),
3336
- ""
3337
- ];
3338
- if (hasNonEmptyReferenceAnswer(evalCase)) {
3339
- parts.push(
3340
- "[[ ## reference_answer ## ]]",
3341
- evalCase.reference_answer.trim(),
3342
- ""
3343
- );
3344
- }
3345
- parts.push(
3346
- "[[ ## candidate_answer ## ]]",
3347
- candidate.trim()
3348
- );
3349
- return parts.join("\n");
3522
+ ].join("\n");
3350
3523
  }
3351
3524
  function clampScore(value) {
3352
3525
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -3428,9 +3601,6 @@ function extractJsonBlob(text) {
3428
3601
  function isNonEmptyString(value) {
3429
3602
  return typeof value === "string" && value.trim().length > 0;
3430
3603
  }
3431
- function hasNonEmptyReferenceAnswer(evalCase) {
3432
- return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
3433
- }
3434
3604
  var CodeEvaluator = class {
3435
3605
  kind = "code";
3436
3606
  script;
@@ -3536,19 +3706,16 @@ function parseJsonSafe(payload) {
3536
3706
  return void 0;
3537
3707
  }
3538
3708
  }
3539
- function hasTemplateVariables(text) {
3540
- return /\$\{[a-zA-Z0-9_]+\}/.test(text);
3541
- }
3542
3709
  function substituteVariables(template, variables) {
3543
- return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
3710
+ return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
3544
3711
  return variables[varName] ?? match;
3545
3712
  });
3546
3713
  }
3547
3714
 
3548
3715
  // src/evaluation/orchestrator.ts
3549
- var import_node_crypto3 = require("crypto");
3550
- var import_promises6 = require("fs/promises");
3551
- var import_node_path8 = __toESM(require("path"), 1);
3716
+ var import_node_crypto2 = require("crypto");
3717
+ var import_promises10 = require("fs/promises");
3718
+ var import_node_path13 = __toESM(require("path"), 1);
3552
3719
 
3553
3720
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
3554
3721
  var Node = class {
@@ -4111,6 +4278,7 @@ async function evaluateCandidate(options) {
4111
4278
  }
4112
4279
  }
4113
4280
  return {
4281
+ timestamp: completedAt.toISOString(),
4114
4282
  eval_id: evalCase.id,
4115
4283
  dataset: evalCase.dataset,
4116
4284
  conversation_id: evalCase.conversation_id,
@@ -4118,14 +4286,12 @@ async function evaluateCandidate(options) {
4118
4286
  hits: score.hits,
4119
4287
  misses: score.misses,
4120
4288
  candidate_answer: candidate,
4121
- expected_aspect_count: score.expectedAspectCount,
4122
4289
  target: target.name,
4123
- timestamp: completedAt.toISOString(),
4124
4290
  reasoning: score.reasoning,
4125
4291
  raw_aspects: score.rawAspects,
4126
4292
  agent_provider_request: agentProviderRequest,
4127
4293
  lm_provider_request: lmProviderRequest,
4128
- evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4294
+ evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4129
4295
  evaluator_results: evaluatorResults
4130
4296
  };
4131
4297
  }
@@ -4202,7 +4368,7 @@ async function runEvaluatorList(options) {
4202
4368
  hits: score2.hits,
4203
4369
  misses: score2.misses,
4204
4370
  reasoning: score2.reasoning,
4205
- evaluator_raw_request: score2.evaluatorRawRequest
4371
+ evaluator_provider_request: score2.evaluatorRawRequest
4206
4372
  });
4207
4373
  continue;
4208
4374
  }
@@ -4229,7 +4395,7 @@ async function runEvaluatorList(options) {
4229
4395
  hits: score2.hits,
4230
4396
  misses: score2.misses,
4231
4397
  reasoning: score2.reasoning,
4232
- evaluator_raw_request: score2.evaluatorRawRequest
4398
+ evaluator_provider_request: score2.evaluatorRawRequest
4233
4399
  });
4234
4400
  continue;
4235
4401
  }
@@ -4282,7 +4448,7 @@ async function runLlmJudgeEvaluator(options) {
4282
4448
  promptInputs,
4283
4449
  now,
4284
4450
  judgeProvider,
4285
- systemPrompt: customPrompt,
4451
+ evaluatorTemplateOverride: customPrompt,
4286
4452
  evaluator: config
4287
4453
  });
4288
4454
  }
@@ -4323,22 +4489,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
4323
4489
  async function dumpPrompt(directory, evalCase, promptInputs) {
4324
4490
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
4325
4491
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
4326
- const filePath = import_node_path8.default.resolve(directory, filename);
4327
- await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
4492
+ const filePath = import_node_path13.default.resolve(directory, filename);
4493
+ await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
4328
4494
  const payload = {
4329
4495
  eval_id: evalCase.id,
4330
4496
  question: promptInputs.question,
4331
4497
  guidelines: promptInputs.guidelines,
4332
4498
  guideline_paths: evalCase.guideline_paths
4333
4499
  };
4334
- await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4500
+ await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
4335
4501
  }
4336
4502
  function sanitizeFilename(value) {
4337
4503
  if (!value) {
4338
4504
  return "prompt";
4339
4505
  }
4340
4506
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
4341
- return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
4507
+ return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
4342
4508
  }
4343
4509
  async function invokeProvider(provider, options) {
4344
4510
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -4394,6 +4560,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4394
4560
  }
4395
4561
  }
4396
4562
  return {
4563
+ timestamp: timestamp.toISOString(),
4397
4564
  eval_id: evalCase.id,
4398
4565
  dataset: evalCase.dataset,
4399
4566
  conversation_id: evalCase.conversation_id,
@@ -4401,9 +4568,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4401
4568
  hits: [],
4402
4569
  misses: [`Error: ${message}`],
4403
4570
  candidate_answer: `Error occurred: ${message}`,
4404
- expected_aspect_count: 0,
4405
4571
  target: targetName,
4406
- timestamp: timestamp.toISOString(),
4407
4572
  raw_aspects: [],
4408
4573
  agent_provider_request: agentProviderRequest,
4409
4574
  lm_provider_request: lmProviderRequest,
@@ -4411,7 +4576,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
4411
4576
  };
4412
4577
  }
4413
4578
  function createCacheKey(provider, target, evalCase, promptInputs) {
4414
- const hash = (0, import_node_crypto3.createHash)("sha256");
4579
+ const hash = (0, import_node_crypto2.createHash)("sha256");
4415
4580
  hash.update(provider.id);
4416
4581
  hash.update(target.name);
4417
4582
  hash.update(evalCase.id);