@agentv/core 0.11.0 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-YQBJAT5I.js";
12
+ } from "./chunk-IOCVST3R.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -62,48 +62,197 @@ function getHitCount(result) {
62
62
  }
63
63
 
64
64
  // src/evaluation/yaml-parser.ts
65
+ import { readFile as readFile4 } from "node:fs/promises";
66
+ import path6 from "node:path";
67
+ import { parse as parse2 } from "yaml";
68
+
69
+ // src/evaluation/formatting/segment-formatter.ts
70
+ function extractCodeBlocks(segments) {
71
+ const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
72
+ const codeBlocks = [];
73
+ for (const segment of segments) {
74
+ const typeValue = segment["type"];
75
+ if (typeof typeValue !== "string" || typeValue !== "text") {
76
+ continue;
77
+ }
78
+ const textValue = segment["value"];
79
+ if (typeof textValue !== "string") {
80
+ continue;
81
+ }
82
+ const matches = textValue.match(CODE_BLOCK_PATTERN);
83
+ if (matches) {
84
+ codeBlocks.push(...matches);
85
+ }
86
+ }
87
+ return codeBlocks;
88
+ }
89
+ function formatFileContents(parts) {
90
+ const fileCount = parts.filter((p) => p.isFile).length;
91
+ if (fileCount > 0) {
92
+ return parts.map((part) => {
93
+ if (part.isFile && part.displayPath) {
94
+ return `<file path="${part.displayPath}">
95
+ ${part.content}
96
+ </file>`;
97
+ }
98
+ return part.content;
99
+ }).join("\n\n");
100
+ }
101
+ return parts.map((p) => p.content).join(" ");
102
+ }
103
+ function formatSegment(segment) {
104
+ const type = asString(segment.type);
105
+ if (type === "text") {
106
+ return asString(segment.value);
107
+ }
108
+ if (type === "guideline_ref") {
109
+ const refPath = asString(segment.path);
110
+ return refPath ? `<Attached: ${refPath}>` : void 0;
111
+ }
112
+ if (type === "file") {
113
+ const text = asString(segment.text);
114
+ const filePath = asString(segment.path);
115
+ if (text && filePath) {
116
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
117
+ }
118
+ }
119
+ return void 0;
120
+ }
121
+ function hasVisibleContent(segments) {
122
+ return segments.some((segment) => {
123
+ const type = asString(segment.type);
124
+ if (type === "text") {
125
+ const value = asString(segment.value);
126
+ return value !== void 0 && value.trim().length > 0;
127
+ }
128
+ if (type === "guideline_ref") {
129
+ return false;
130
+ }
131
+ if (type === "file") {
132
+ const text = asString(segment.text);
133
+ return text !== void 0 && text.trim().length > 0;
134
+ }
135
+ return false;
136
+ });
137
+ }
138
+ function asString(value) {
139
+ return typeof value === "string" ? value : void 0;
140
+ }
141
+
142
+ // src/evaluation/loaders/config-loader.ts
65
143
  import micromatch from "micromatch";
144
+ import { readFile } from "node:fs/promises";
145
+ import path2 from "node:path";
146
+ import { parse } from "yaml";
147
+
148
+ // src/evaluation/loaders/file-resolver.ts
66
149
  import { constants } from "node:fs";
67
- import { access, readFile } from "node:fs/promises";
150
+ import { access } from "node:fs/promises";
68
151
  import path from "node:path";
69
- import { fileURLToPath } from "node:url";
70
- import { parse } from "yaml";
71
- var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
72
- var ANSI_YELLOW = "\x1B[33m";
73
- var ANSI_RESET = "\x1B[0m";
74
- var SCHEMA_EVAL_V2 = "agentv-eval-v2";
75
- var SCHEMA_CONFIG_V2 = "agentv-config-v2";
76
- async function readTestSuiteMetadata(testFilePath) {
152
+ async function fileExists2(absolutePath) {
77
153
  try {
78
- const absolutePath = path.resolve(testFilePath);
79
- const content = await readFile(absolutePath, "utf8");
80
- const parsed = parse(content);
81
- if (!isJsonObject(parsed)) {
82
- return {};
83
- }
84
- return { target: extractTargetFromSuite(parsed) };
154
+ await access(absolutePath, constants.F_OK);
155
+ return true;
85
156
  } catch {
86
- return {};
157
+ return false;
87
158
  }
88
159
  }
89
- function extractTargetFromSuite(suite) {
90
- const execution = suite.execution;
91
- if (execution && typeof execution === "object" && !Array.isArray(execution)) {
92
- const executionTarget = execution.target;
93
- if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
94
- return executionTarget.trim();
160
+ function resolveToAbsolutePath(candidate) {
161
+ if (candidate instanceof URL) {
162
+ return new URL(candidate).pathname;
163
+ }
164
+ if (typeof candidate === "string") {
165
+ if (candidate.startsWith("file://")) {
166
+ return new URL(candidate).pathname;
95
167
  }
168
+ return path.resolve(candidate);
96
169
  }
97
- const targetValue = suite.target;
98
- if (typeof targetValue === "string" && targetValue.trim().length > 0) {
99
- return targetValue.trim();
170
+ throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
171
+ }
172
+ function buildDirectoryChain2(filePath, repoRoot) {
173
+ const directories = [];
174
+ const seen = /* @__PURE__ */ new Set();
175
+ const boundary = path.resolve(repoRoot);
176
+ let current = path.resolve(path.dirname(filePath));
177
+ while (current !== void 0) {
178
+ if (!seen.has(current)) {
179
+ directories.push(current);
180
+ seen.add(current);
181
+ }
182
+ if (current === boundary) {
183
+ break;
184
+ }
185
+ const parent = path.dirname(current);
186
+ if (parent === current) {
187
+ break;
188
+ }
189
+ current = parent;
100
190
  }
101
- return void 0;
191
+ if (!seen.has(boundary)) {
192
+ directories.push(boundary);
193
+ }
194
+ return directories;
195
+ }
196
+ function buildSearchRoots2(evalPath, repoRoot) {
197
+ const uniqueRoots = [];
198
+ const addRoot = (root) => {
199
+ const normalized = path.resolve(root);
200
+ if (!uniqueRoots.includes(normalized)) {
201
+ uniqueRoots.push(normalized);
202
+ }
203
+ };
204
+ let currentDir = path.dirname(evalPath);
205
+ let reachedBoundary = false;
206
+ while (!reachedBoundary) {
207
+ addRoot(currentDir);
208
+ const parentDir = path.dirname(currentDir);
209
+ if (currentDir === repoRoot || parentDir === currentDir) {
210
+ reachedBoundary = true;
211
+ } else {
212
+ currentDir = parentDir;
213
+ }
214
+ }
215
+ addRoot(repoRoot);
216
+ addRoot(process.cwd());
217
+ return uniqueRoots;
102
218
  }
219
+ function trimLeadingSeparators(value) {
220
+ const trimmed = value.replace(/^[/\\]+/, "");
221
+ return trimmed.length > 0 ? trimmed : value;
222
+ }
223
+ async function resolveFileReference2(rawValue, searchRoots) {
224
+ const displayPath = trimLeadingSeparators(rawValue);
225
+ const potentialPaths = [];
226
+ if (path.isAbsolute(rawValue)) {
227
+ potentialPaths.push(path.normalize(rawValue));
228
+ }
229
+ for (const base of searchRoots) {
230
+ potentialPaths.push(path.resolve(base, displayPath));
231
+ }
232
+ const attempted = [];
233
+ const seen = /* @__PURE__ */ new Set();
234
+ for (const candidate of potentialPaths) {
235
+ const absoluteCandidate = path.resolve(candidate);
236
+ if (seen.has(absoluteCandidate)) {
237
+ continue;
238
+ }
239
+ seen.add(absoluteCandidate);
240
+ attempted.push(absoluteCandidate);
241
+ if (await fileExists2(absoluteCandidate)) {
242
+ return { displayPath, resolvedPath: absoluteCandidate, attempted };
243
+ }
244
+ }
245
+ return { displayPath, attempted };
246
+ }
247
+
248
+ // src/evaluation/loaders/config-loader.ts
249
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
250
+ var ANSI_YELLOW = "\x1B[33m";
251
+ var ANSI_RESET = "\x1B[0m";
103
252
  async function loadConfig(evalFilePath, repoRoot) {
104
- const directories = buildDirectoryChain(evalFilePath, repoRoot);
253
+ const directories = buildDirectoryChain2(evalFilePath, repoRoot);
105
254
  for (const directory of directories) {
106
- const configPath = path.join(directory, ".agentv", "config.yaml");
255
+ const configPath = path2.join(directory, ".agentv", "config.yaml");
107
256
  if (!await fileExists2(configPath)) {
108
257
  continue;
109
258
  }
@@ -146,24 +295,134 @@ function isGuidelineFile(filePath, patterns) {
146
295
  const patternsToUse = patterns ?? [];
147
296
  return micromatch.isMatch(normalized, patternsToUse);
148
297
  }
149
- function extractCodeBlocks(segments) {
150
- const codeBlocks = [];
151
- for (const segment of segments) {
152
- const typeValue = segment["type"];
153
- if (typeof typeValue !== "string" || typeValue !== "text") {
298
+ function extractTargetFromSuite(suite) {
299
+ const execution = suite.execution;
300
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
301
+ const executionTarget = execution.target;
302
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
303
+ return executionTarget.trim();
304
+ }
305
+ }
306
+ const targetValue = suite.target;
307
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
308
+ return targetValue.trim();
309
+ }
310
+ return void 0;
311
+ }
312
+ function logWarning(message) {
313
+ console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
314
+ }
315
+
316
+ // src/evaluation/loaders/evaluator-parser.ts
317
+ import path3 from "node:path";
318
+ var ANSI_YELLOW2 = "\x1B[33m";
319
+ var ANSI_RESET2 = "\x1B[0m";
320
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
321
+ const execution = rawEvalCase.execution;
322
+ const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
323
+ if (candidateEvaluators === void 0) {
324
+ return void 0;
325
+ }
326
+ if (!Array.isArray(candidateEvaluators)) {
327
+ logWarning2(`Skipping evaluators for '${evalId}': expected array`);
328
+ return void 0;
329
+ }
330
+ const evaluators = [];
331
+ for (const rawEvaluator of candidateEvaluators) {
332
+ if (!isJsonObject2(rawEvaluator)) {
333
+ logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
154
334
  continue;
155
335
  }
156
- const textValue = segment["value"];
157
- if (typeof textValue !== "string") {
336
+ const name = asString2(rawEvaluator.name);
337
+ const typeValue = rawEvaluator.type;
338
+ if (!name || !isEvaluatorKind(typeValue)) {
339
+ logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
158
340
  continue;
159
341
  }
160
- const matches = textValue.match(CODE_BLOCK_PATTERN);
161
- if (matches) {
162
- codeBlocks.push(...matches);
342
+ if (typeValue === "code") {
343
+ const script = asString2(rawEvaluator.script);
344
+ if (!script) {
345
+ logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
346
+ continue;
347
+ }
348
+ const cwd = asString2(rawEvaluator.cwd);
349
+ let resolvedCwd;
350
+ if (cwd) {
351
+ const resolved = await resolveFileReference2(cwd, searchRoots);
352
+ if (resolved.resolvedPath) {
353
+ resolvedCwd = path3.resolve(resolved.resolvedPath);
354
+ } else {
355
+ logWarning2(
356
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
357
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
358
+ );
359
+ }
360
+ } else {
361
+ resolvedCwd = searchRoots[0];
362
+ }
363
+ evaluators.push({
364
+ name,
365
+ type: "code",
366
+ script,
367
+ cwd,
368
+ resolvedCwd
369
+ });
370
+ continue;
371
+ }
372
+ const prompt = asString2(rawEvaluator.prompt);
373
+ let promptPath;
374
+ if (prompt) {
375
+ const resolved = await resolveFileReference2(prompt, searchRoots);
376
+ if (resolved.resolvedPath) {
377
+ promptPath = path3.resolve(resolved.resolvedPath);
378
+ } else {
379
+ logWarning2(
380
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
381
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
382
+ );
383
+ }
163
384
  }
385
+ const _model = asString2(rawEvaluator.model);
386
+ evaluators.push({
387
+ name,
388
+ type: "llm_judge",
389
+ prompt,
390
+ promptPath
391
+ });
164
392
  }
165
- return codeBlocks;
393
+ return evaluators.length > 0 ? evaluators : void 0;
166
394
  }
395
+ function coerceEvaluator(candidate, contextId) {
396
+ if (typeof candidate !== "string") {
397
+ return void 0;
398
+ }
399
+ if (isEvaluatorKind(candidate)) {
400
+ return candidate;
401
+ }
402
+ logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
403
+ return void 0;
404
+ }
405
+ function asString2(value) {
406
+ return typeof value === "string" ? value : void 0;
407
+ }
408
+ function isJsonObject2(value) {
409
+ return typeof value === "object" && value !== null && !Array.isArray(value);
410
+ }
411
+ function logWarning2(message, details) {
412
+ if (details && details.length > 0) {
413
+ const detailBlock = details.join("\n");
414
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}
415
+ ${detailBlock}${ANSI_RESET2}`);
416
+ } else {
417
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
418
+ }
419
+ }
420
+
421
+ // src/evaluation/loaders/message-processor.ts
422
+ import { readFile as readFile2 } from "node:fs/promises";
423
+ import path4 from "node:path";
424
+ var ANSI_YELLOW3 = "\x1B[33m";
425
+ var ANSI_RESET3 = "\x1B[0m";
167
426
  async function processMessages(options) {
168
427
  const {
169
428
  messages,
@@ -189,28 +448,28 @@ async function processMessages(options) {
189
448
  if (!isJsonObject(rawSegment)) {
190
449
  continue;
191
450
  }
192
- const segmentType = asString(rawSegment.type);
451
+ const segmentType = asString3(rawSegment.type);
193
452
  if (segmentType === "file") {
194
- const rawValue = asString(rawSegment.value);
453
+ const rawValue = asString3(rawSegment.value);
195
454
  if (!rawValue) {
196
455
  continue;
197
456
  }
198
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
457
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
199
458
  rawValue,
200
459
  searchRoots
201
460
  );
202
461
  if (!resolvedPath) {
203
462
  const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
204
463
  const context = messageType === "input" ? "" : " in expected_messages";
205
- logWarning(`File not found${context}: ${displayPath}`, attempts);
464
+ logWarning3(`File not found${context}: ${displayPath}`, attempts);
206
465
  continue;
207
466
  }
208
467
  try {
209
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
468
+ const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
210
469
  if (messageType === "input" && guidelinePatterns && guidelinePaths) {
211
- const relativeToRepo = path.relative(repoRootPath, resolvedPath);
470
+ const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
212
471
  if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
213
- guidelinePaths.push(path.resolve(resolvedPath));
472
+ guidelinePaths.push(path4.resolve(resolvedPath));
214
473
  if (verbose) {
215
474
  console.log(` [Guideline] Found: ${displayPath}`);
216
475
  console.log(` Resolved to: ${resolvedPath}`);
@@ -222,7 +481,7 @@ async function processMessages(options) {
222
481
  type: "file",
223
482
  path: displayPath,
224
483
  text: fileContent,
225
- resolvedPath: path.resolve(resolvedPath)
484
+ resolvedPath: path4.resolve(resolvedPath)
226
485
  });
227
486
  if (verbose) {
228
487
  const label = messageType === "input" ? "[File]" : "[Expected Output File]";
@@ -231,7 +490,7 @@ async function processMessages(options) {
231
490
  }
232
491
  } catch (error) {
233
492
  const context = messageType === "input" ? "" : " expected output";
234
- logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
493
+ logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
235
494
  }
236
495
  continue;
237
496
  }
@@ -245,201 +504,117 @@ async function processMessages(options) {
245
504
  }
246
505
  return segments;
247
506
  }
248
- async function loadEvalCases(evalFilePath, repoRoot, options) {
249
- const verbose = options?.verbose ?? false;
250
- const evalIdFilter = options?.evalId;
251
- const absoluteTestPath = path.resolve(evalFilePath);
252
- if (!await fileExists2(absoluteTestPath)) {
253
- throw new Error(`Test file not found: ${evalFilePath}`);
254
- }
255
- const repoRootPath = resolveToAbsolutePath(repoRoot);
256
- const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
257
- const config = await loadConfig(absoluteTestPath, repoRootPath);
258
- const guidelinePatterns = config?.guideline_patterns;
259
- const rawFile = await readFile(absoluteTestPath, "utf8");
260
- const parsed = parse(rawFile);
261
- if (!isJsonObject(parsed)) {
262
- throw new Error(`Invalid test file format: ${evalFilePath}`);
263
- }
264
- const suite = parsed;
265
- const datasetNameFromSuite = asString(suite.dataset)?.trim();
266
- const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
267
- const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
268
- const schema = suite.$schema;
269
- if (schema !== SCHEMA_EVAL_V2) {
270
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
271
- Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
272
- throw new Error(message);
507
+ async function resolveAssistantContent(content, searchRoots, verbose) {
508
+ if (typeof content === "string") {
509
+ return content;
273
510
  }
274
- const rawTestcases = suite.evalcases;
275
- if (!Array.isArray(rawTestcases)) {
276
- throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
511
+ if (!content) {
512
+ return "";
277
513
  }
278
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
279
- const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
280
- const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
281
- const results = [];
282
- for (const rawEvalcase of rawTestcases) {
283
- if (!isJsonObject(rawEvalcase)) {
284
- logWarning("Skipping invalid eval case entry (expected object)");
514
+ const parts = [];
515
+ for (const entry of content) {
516
+ if (typeof entry === "string") {
517
+ parts.push({ content: entry, isFile: false });
285
518
  continue;
286
519
  }
287
- const evalcase = rawEvalcase;
288
- const id = asString(evalcase.id);
289
- if (evalIdFilter && id !== evalIdFilter) {
520
+ if (!isJsonObject(entry)) {
290
521
  continue;
291
522
  }
292
- const conversationId = asString(evalcase.conversation_id);
293
- const outcome = asString(evalcase.outcome);
294
- const inputMessagesValue = evalcase.input_messages;
295
- const expectedMessagesValue = evalcase.expected_messages;
296
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
297
- logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
523
+ const segmentType = asString3(entry.type);
524
+ if (segmentType === "file") {
525
+ const rawValue = asString3(entry.value);
526
+ if (!rawValue) {
527
+ continue;
528
+ }
529
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
530
+ rawValue,
531
+ searchRoots
532
+ );
533
+ if (!resolvedPath) {
534
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
535
+ logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
536
+ continue;
537
+ }
538
+ try {
539
+ const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
540
+ parts.push({ content: fileContent, isFile: true, displayPath });
541
+ if (verbose) {
542
+ console.log(` [Expected Assistant File] Found: ${displayPath}`);
543
+ console.log(` Resolved to: ${resolvedPath}`);
544
+ }
545
+ } catch (error) {
546
+ logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
547
+ }
298
548
  continue;
299
549
  }
300
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
301
- const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
302
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
303
- if (hasExpectedMessages && expectedMessages.length === 0) {
304
- logWarning(`No valid expected message found for eval case: ${id}`);
550
+ const textValue = asString3(entry.text);
551
+ if (typeof textValue === "string") {
552
+ parts.push({ content: textValue, isFile: false });
305
553
  continue;
306
554
  }
307
- if (expectedMessages.length > 1) {
308
- logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
309
- }
310
- const guidelinePaths = [];
311
- const inputTextParts = [];
312
- const inputSegments = await processMessages({
313
- messages: inputMessages,
314
- searchRoots,
315
- repoRootPath,
316
- guidelinePatterns,
317
- guidelinePaths,
318
- textParts: inputTextParts,
319
- messageType: "input",
320
- verbose
321
- });
322
- const outputSegments = hasExpectedMessages ? await processMessages({
323
- messages: expectedMessages,
324
- searchRoots,
325
- repoRootPath,
326
- guidelinePatterns,
327
- messageType: "output",
328
- verbose
329
- }) : [];
330
- const codeSnippets = extractCodeBlocks(inputSegments);
331
- const expectedContent = expectedMessages[0]?.content;
332
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
333
- const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
334
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
335
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
336
- const userFilePaths = [];
337
- for (const segment of inputSegments) {
338
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
339
- userFilePaths.push(segment.resolvedPath);
340
- }
341
- }
342
- const allFilePaths = [
343
- ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
344
- ...userFilePaths
345
- ];
346
- const testCase = {
347
- id,
348
- dataset: datasetName,
349
- conversation_id: conversationId,
350
- question,
351
- input_messages: inputMessages,
352
- input_segments: inputSegments,
353
- output_segments: outputSegments,
354
- reference_answer: referenceAnswer,
355
- guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
356
- guideline_patterns: guidelinePatterns,
357
- file_paths: allFilePaths,
358
- code_snippets: codeSnippets,
359
- expected_outcome: outcome,
360
- evaluator: evalCaseEvaluatorKind,
361
- evaluators
362
- };
363
- if (verbose) {
364
- console.log(`
365
- [Eval Case: ${id}]`);
366
- if (testCase.guideline_paths.length > 0) {
367
- console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
368
- for (const guidelinePath of testCase.guideline_paths) {
369
- console.log(` - ${guidelinePath}`);
370
- }
371
- } else {
372
- console.log(" No guidelines found");
373
- }
555
+ const valueValue = asString3(entry.value);
556
+ if (typeof valueValue === "string") {
557
+ parts.push({ content: valueValue, isFile: false });
558
+ continue;
374
559
  }
375
- results.push(testCase);
560
+ parts.push({ content: JSON.stringify(entry), isFile: false });
376
561
  }
377
- return results;
562
+ return formatFileContents(parts);
378
563
  }
379
- function needsRoleMarkers(messages, processedSegmentsByMessage) {
380
- if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
381
- return true;
382
- }
383
- let messagesWithContent = 0;
384
- for (const segments of processedSegmentsByMessage) {
385
- if (hasVisibleContent(segments)) {
386
- messagesWithContent++;
387
- }
388
- }
389
- return messagesWithContent > 1;
564
+ function asString3(value) {
565
+ return typeof value === "string" ? value : void 0;
390
566
  }
391
- function hasVisibleContent(segments) {
392
- return segments.some((segment) => {
393
- const type = asString(segment.type);
394
- if (type === "text") {
395
- const value = asString(segment.value);
396
- return value !== void 0 && value.trim().length > 0;
397
- }
398
- if (type === "guideline_ref") {
399
- return false;
400
- }
401
- if (type === "file") {
402
- const text = asString(segment.text);
403
- return text !== void 0 && text.trim().length > 0;
404
- }
405
- return false;
406
- });
567
+ function cloneJsonObject(source) {
568
+ const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
569
+ return Object.fromEntries(entries);
407
570
  }
408
- function formatSegment(segment) {
409
- const type = asString(segment.type);
410
- if (type === "text") {
411
- return asString(segment.value);
571
+ function cloneJsonValue(value) {
572
+ if (value === null) {
573
+ return null;
412
574
  }
413
- if (type === "guideline_ref") {
414
- const refPath = asString(segment.path);
415
- return refPath ? `<Attached: ${refPath}>` : void 0;
575
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
576
+ return value;
416
577
  }
417
- if (type === "file") {
418
- const text = asString(segment.text);
419
- const filePath = asString(segment.path);
420
- if (text && filePath) {
421
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
422
- }
578
+ if (Array.isArray(value)) {
579
+ return value.map((item) => cloneJsonValue(item));
580
+ }
581
+ if (typeof value === "object") {
582
+ return cloneJsonObject(value);
583
+ }
584
+ return value;
585
+ }
586
+ function logWarning3(message, details) {
587
+ if (details && details.length > 0) {
588
+ const detailBlock = details.join("\n");
589
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
590
+ ${detailBlock}${ANSI_RESET3}`);
591
+ } else {
592
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
423
593
  }
424
- return void 0;
425
594
  }
595
+
596
+ // src/evaluation/formatting/prompt-builder.ts
597
+ import { readFile as readFile3 } from "node:fs/promises";
598
+ import path5 from "node:path";
599
+ var ANSI_YELLOW4 = "\x1B[33m";
600
+ var ANSI_RESET4 = "\x1B[0m";
426
601
  async function buildPromptInputs(testCase) {
427
602
  const guidelineParts = [];
428
603
  for (const rawPath of testCase.guideline_paths) {
429
- const absolutePath = path.resolve(rawPath);
604
+ const absolutePath = path5.resolve(rawPath);
430
605
  if (!await fileExists2(absolutePath)) {
431
- logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
606
+ logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
432
607
  continue;
433
608
  }
434
609
  try {
435
- const content = (await readFile(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
610
+ const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
436
611
  guidelineParts.push({
437
612
  content,
438
613
  isFile: true,
439
- displayPath: path.basename(absolutePath)
614
+ displayPath: path5.basename(absolutePath)
440
615
  });
441
616
  } catch (error) {
442
- logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
617
+ logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
443
618
  }
444
619
  }
445
620
  const guidelines = formatFileContents(guidelineParts);
@@ -463,9 +638,9 @@ async function buildPromptInputs(testCase) {
463
638
  messageSegments.push({ type: "text", value: segment });
464
639
  }
465
640
  } else if (isJsonObject(segment)) {
466
- const type = asString(segment.type);
641
+ const type = asString4(segment.type);
467
642
  if (type === "file") {
468
- const value = asString(segment.value);
643
+ const value = asString4(segment.value);
469
644
  if (!value) continue;
470
645
  if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
471
646
  messageSegments.push({ type: "guideline_ref", path: value });
@@ -476,7 +651,7 @@ async function buildPromptInputs(testCase) {
476
651
  messageSegments.push({ type: "file", text: fileText, path: value });
477
652
  }
478
653
  } else if (type === "text") {
479
- const textValue = asString(segment.value);
654
+ const textValue = asString4(segment.value);
480
655
  if (textValue && textValue.trim().length > 0) {
481
656
  messageSegments.push({ type: "text", value: textValue });
482
657
  }
@@ -532,6 +707,18 @@ ${messageContent}`);
532
707
  }) : void 0;
533
708
  return { question, guidelines, chatPrompt };
534
709
  }
710
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
711
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
712
+ return true;
713
+ }
714
+ let messagesWithContent = 0;
715
+ for (const segments of processedSegmentsByMessage) {
716
+ if (hasVisibleContent(segments)) {
717
+ messagesWithContent++;
718
+ }
719
+ }
720
+ return messagesWithContent > 1;
721
+ }
535
722
  function buildChatPromptFromSegments(options) {
536
723
  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
537
724
  if (messages.length === 0) {
@@ -573,13 +760,12 @@ ${guidelineContent.trim()}`);
573
760
  const segments = segmentsByMessage[i];
574
761
  const contentParts = [];
575
762
  let role = message.role;
576
- let name;
577
763
  if (role === "system") {
578
764
  role = "assistant";
579
765
  contentParts.push("@[System]:");
580
766
  } else if (role === "tool") {
581
- role = "function";
582
- name = "tool";
767
+ role = "assistant";
768
+ contentParts.push("@[Tool]:");
583
769
  }
584
770
  for (const segment of segments) {
585
771
  if (segment.type === "guideline_ref") {
@@ -597,282 +783,398 @@ ${guidelineContent.trim()}`);
597
783
  if (contentParts.length === 0) {
598
784
  continue;
599
785
  }
786
+ const content = contentParts.join("\n");
600
787
  chatPrompt.push({
601
788
  role,
602
- content: contentParts.join("\n"),
603
- ...name ? { name } : {}
789
+ content
604
790
  });
605
791
  }
606
792
  return chatPrompt.length > 0 ? chatPrompt : void 0;
607
793
  }
608
- async function fileExists2(absolutePath) {
609
- try {
610
- await access(absolutePath, constants.F_OK);
611
- return true;
612
- } catch {
613
- return false;
614
- }
615
- }
616
- function resolveToAbsolutePath(candidate) {
617
- if (candidate instanceof URL) {
618
- return fileURLToPath(candidate);
619
- }
620
- if (typeof candidate === "string") {
621
- if (candidate.startsWith("file://")) {
622
- return fileURLToPath(new URL(candidate));
623
- }
624
- return path.resolve(candidate);
625
- }
626
- throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
627
- }
628
- function asString(value) {
794
+ function asString4(value) {
629
795
  return typeof value === "string" ? value : void 0;
630
796
  }
631
- function cloneJsonObject(source) {
632
- const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
633
- return Object.fromEntries(entries);
797
+ function logWarning4(message) {
798
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
634
799
  }
635
- function cloneJsonValue(value) {
636
- if (value === null) {
637
- return null;
638
- }
639
- if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
640
- return value;
641
- }
642
- if (Array.isArray(value)) {
643
- return value.map((item) => cloneJsonValue(item));
800
+
801
+ // src/evaluation/yaml-parser.ts
802
+ var ANSI_YELLOW5 = "\x1B[33m";
803
+ var ANSI_RESET5 = "\x1B[0m";
804
+ var SCHEMA_EVAL_V2 = "agentv-eval-v2";
805
+ async function readTestSuiteMetadata(testFilePath) {
806
+ try {
807
+ const absolutePath = path6.resolve(testFilePath);
808
+ const content = await readFile4(absolutePath, "utf8");
809
+ const parsed = parse2(content);
810
+ if (!isJsonObject(parsed)) {
811
+ return {};
812
+ }
813
+ return { target: extractTargetFromSuite(parsed) };
814
+ } catch {
815
+ return {};
644
816
  }
645
- return cloneJsonObject(value);
646
817
  }
647
- function formatFileContents(parts) {
648
- const fileCount = parts.filter((p) => p.isFile).length;
649
- if (fileCount > 0) {
650
- return parts.map((part) => {
651
- if (part.isFile && part.displayPath) {
652
- return `<file path="${part.displayPath}">
653
- ${part.content}
654
- </file>`;
655
- }
656
- return part.content;
657
- }).join("\n\n");
818
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
819
+ const verbose = options?.verbose ?? false;
820
+ const evalIdFilter = options?.evalId;
821
+ const absoluteTestPath = path6.resolve(evalFilePath);
822
+ const repoRootPath = resolveToAbsolutePath(repoRoot);
823
+ const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
824
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
825
+ const guidelinePatterns = config?.guideline_patterns;
826
+ const rawFile = await readFile4(absoluteTestPath, "utf8");
827
+ const parsed = parse2(rawFile);
828
+ if (!isJsonObject(parsed)) {
829
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
658
830
  }
659
- return parts.map((p) => p.content).join(" ");
660
- }
661
- async function resolveAssistantContent(content, searchRoots, verbose) {
662
- if (typeof content === "string") {
663
- return content;
831
+ const suite = parsed;
832
+ const datasetNameFromSuite = asString5(suite.dataset)?.trim();
833
+ const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
834
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
835
+ const schema = suite.$schema;
836
+ if (schema !== SCHEMA_EVAL_V2) {
837
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
838
+ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
839
+ throw new Error(message);
664
840
  }
665
- if (!content) {
666
- return "";
841
+ const rawTestcases = suite.evalcases;
842
+ if (!Array.isArray(rawTestcases)) {
843
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
667
844
  }
668
- const parts = [];
669
- for (const entry of content) {
670
- if (typeof entry === "string") {
671
- parts.push({ content: entry, isFile: false });
672
- continue;
673
- }
674
- if (!isJsonObject(entry)) {
675
- continue;
676
- }
677
- const segmentType = asString(entry.type);
678
- if (segmentType === "file") {
679
- const rawValue = asString(entry.value);
680
- if (!rawValue) {
681
- continue;
682
- }
683
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
684
- rawValue,
685
- searchRoots
686
- );
687
- if (!resolvedPath) {
688
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
689
- logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
690
- continue;
691
- }
692
- try {
693
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
694
- parts.push({ content: fileContent, isFile: true, displayPath });
695
- if (verbose) {
696
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
697
- console.log(` Resolved to: ${resolvedPath}`);
698
- }
699
- } catch (error) {
700
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
701
- }
702
- continue;
703
- }
704
- const textValue = asString(entry.text);
705
- if (typeof textValue === "string") {
706
- parts.push({ content: textValue, isFile: false });
845
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
846
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
847
+ const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
848
+ const results = [];
849
+ for (const rawEvalcase of rawTestcases) {
850
+ if (!isJsonObject(rawEvalcase)) {
851
+ logWarning5("Skipping invalid eval case entry (expected object)");
707
852
  continue;
708
853
  }
709
- const valueValue = asString(entry.value);
710
- if (typeof valueValue === "string") {
711
- parts.push({ content: valueValue, isFile: false });
854
+ const evalcase = rawEvalcase;
855
+ const id = asString5(evalcase.id);
856
+ if (evalIdFilter && id !== evalIdFilter) {
712
857
  continue;
713
858
  }
714
- parts.push({ content: JSON.stringify(entry), isFile: false });
715
- }
716
- return formatFileContents(parts);
717
- }
718
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
719
- const execution = rawEvalCase.execution;
720
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
721
- if (candidateEvaluators === void 0) {
722
- return void 0;
723
- }
724
- if (!Array.isArray(candidateEvaluators)) {
725
- logWarning(`Skipping evaluators for '${evalId}': expected array`);
726
- return void 0;
727
- }
728
- const evaluators = [];
729
- for (const rawEvaluator of candidateEvaluators) {
730
- if (!isJsonObject(rawEvaluator)) {
731
- logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
859
+ const conversationId = asString5(evalcase.conversation_id);
860
+ const outcome = asString5(evalcase.outcome);
861
+ const inputMessagesValue = evalcase.input_messages;
862
+ const expectedMessagesValue = evalcase.expected_messages;
863
+ if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
864
+ logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
732
865
  continue;
733
866
  }
734
- const name = asString(rawEvaluator.name);
735
- const typeValue = rawEvaluator.type;
736
- if (!name || !isEvaluatorKind(typeValue)) {
737
- logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
867
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
868
+ const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
869
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
870
+ if (hasExpectedMessages && expectedMessages.length === 0) {
871
+ logWarning5(`No valid expected message found for eval case: ${id}`);
738
872
  continue;
739
873
  }
740
- if (typeValue === "code") {
741
- const script = asString(rawEvaluator.script);
742
- if (!script) {
743
- logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
744
- continue;
745
- }
746
- const cwd = asString(rawEvaluator.cwd);
747
- let resolvedCwd;
748
- if (cwd) {
749
- const resolved = await resolveFileReference(cwd, searchRoots);
750
- if (resolved.resolvedPath) {
751
- resolvedCwd = path.resolve(resolved.resolvedPath);
752
- } else {
753
- logWarning(
754
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
755
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
756
- );
757
- }
758
- } else {
759
- resolvedCwd = searchRoots[0];
760
- }
761
- evaluators.push({
762
- name,
763
- type: "code",
764
- script,
765
- cwd,
766
- resolvedCwd
767
- });
768
- continue;
874
+ if (expectedMessages.length > 1) {
875
+ logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
769
876
  }
770
- const prompt = asString(rawEvaluator.prompt);
771
- let promptPath;
772
- if (prompt) {
773
- const resolved = await resolveFileReference(prompt, searchRoots);
774
- if (resolved.resolvedPath) {
775
- promptPath = path.resolve(resolved.resolvedPath);
776
- } else {
777
- logWarning(
778
- `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
779
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
780
- );
877
+ const guidelinePaths = [];
878
+ const inputTextParts = [];
879
+ const inputSegments = await processMessages({
880
+ messages: inputMessages,
881
+ searchRoots,
882
+ repoRootPath,
883
+ guidelinePatterns,
884
+ guidelinePaths,
885
+ textParts: inputTextParts,
886
+ messageType: "input",
887
+ verbose
888
+ });
889
+ const outputSegments = hasExpectedMessages ? await processMessages({
890
+ messages: expectedMessages,
891
+ searchRoots,
892
+ repoRootPath,
893
+ guidelinePatterns,
894
+ messageType: "output",
895
+ verbose
896
+ }) : [];
897
+ const codeSnippets = extractCodeBlocks(inputSegments);
898
+ const expectedContent = expectedMessages[0]?.content;
899
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
900
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
901
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
902
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
903
+ const userFilePaths = [];
904
+ for (const segment of inputSegments) {
905
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
906
+ userFilePaths.push(segment.resolvedPath);
781
907
  }
782
908
  }
783
- const model = asString(rawEvaluator.model);
784
- evaluators.push({
785
- name,
786
- type: "llm_judge",
787
- prompt,
788
- promptPath
789
- });
909
+ const allFilePaths = [
910
+ ...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
911
+ ...userFilePaths
912
+ ];
913
+ const testCase = {
914
+ id,
915
+ dataset: datasetName,
916
+ conversation_id: conversationId,
917
+ question,
918
+ input_messages: inputMessages,
919
+ input_segments: inputSegments,
920
+ output_segments: outputSegments,
921
+ reference_answer: referenceAnswer,
922
+ guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
923
+ guideline_patterns: guidelinePatterns,
924
+ file_paths: allFilePaths,
925
+ code_snippets: codeSnippets,
926
+ expected_outcome: outcome,
927
+ evaluator: evalCaseEvaluatorKind,
928
+ evaluators
929
+ };
930
+ if (verbose) {
931
+ console.log(`
932
+ [Eval Case: ${id}]`);
933
+ if (testCase.guideline_paths.length > 0) {
934
+ console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
935
+ for (const guidelinePath of testCase.guideline_paths) {
936
+ console.log(` - ${guidelinePath}`);
937
+ }
938
+ } else {
939
+ console.log(" No guidelines found");
940
+ }
941
+ }
942
+ results.push(testCase);
790
943
  }
791
- return evaluators.length > 0 ? evaluators : void 0;
944
+ return results;
792
945
  }
793
- function coerceEvaluator(candidate, contextId) {
794
- if (typeof candidate !== "string") {
795
- return void 0;
796
- }
797
- if (isEvaluatorKind(candidate)) {
798
- return candidate;
799
- }
800
- logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
801
- return void 0;
946
+ function asString5(value) {
947
+ return typeof value === "string" ? value : void 0;
802
948
  }
803
- function logWarning(message, details) {
949
+ function logWarning5(message, details) {
804
950
  if (details && details.length > 0) {
805
951
  const detailBlock = details.join("\n");
806
- console.warn(`${ANSI_YELLOW}Warning: ${message}
807
- ${detailBlock}${ANSI_RESET}`);
952
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}
953
+ ${detailBlock}${ANSI_RESET5}`);
808
954
  } else {
809
- console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
955
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
810
956
  }
811
957
  }
812
958
 
813
- // src/evaluation/providers/ax.ts
814
- import { AxAI } from "@ax-llm/ax";
959
+ // src/evaluation/providers/ai-sdk.ts
960
+ import { createAnthropic } from "@ai-sdk/anthropic";
961
+ import { createAzure } from "@ai-sdk/azure";
962
+ import { createGoogleGenerativeAI } from "@ai-sdk/google";
963
+ import { generateText } from "ai";
815
964
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
965
+ var AzureProvider = class {
966
+ constructor(targetName, config) {
967
+ this.config = config;
968
+ this.id = `azure:${targetName}`;
969
+ this.targetName = targetName;
970
+ this.defaults = {
971
+ temperature: config.temperature,
972
+ maxOutputTokens: config.maxOutputTokens
973
+ };
974
+ this.retryConfig = config.retry;
975
+ const azure = createAzure(buildAzureOptions(config));
976
+ this.model = azure(config.deploymentName);
977
+ }
978
+ id;
979
+ kind = "azure";
980
+ targetName;
981
+ model;
982
+ defaults;
983
+ retryConfig;
984
+ async invoke(request) {
985
+ return invokeModel({
986
+ model: this.model,
987
+ request,
988
+ defaults: this.defaults,
989
+ retryConfig: this.retryConfig
990
+ });
991
+ }
992
+ };
993
+ var AnthropicProvider = class {
994
+ constructor(targetName, config) {
995
+ this.config = config;
996
+ this.id = `anthropic:${targetName}`;
997
+ this.targetName = targetName;
998
+ this.defaults = {
999
+ temperature: config.temperature,
1000
+ maxOutputTokens: config.maxOutputTokens,
1001
+ thinkingBudget: config.thinkingBudget
1002
+ };
1003
+ this.retryConfig = config.retry;
1004
+ const anthropic = createAnthropic({
1005
+ apiKey: config.apiKey
1006
+ });
1007
+ this.model = anthropic(config.model);
1008
+ }
1009
+ id;
1010
+ kind = "anthropic";
1011
+ targetName;
1012
+ model;
1013
+ defaults;
1014
+ retryConfig;
1015
+ async invoke(request) {
1016
+ const providerOptions = buildAnthropicProviderOptions(this.defaults);
1017
+ return invokeModel({
1018
+ model: this.model,
1019
+ request,
1020
+ defaults: this.defaults,
1021
+ retryConfig: this.retryConfig,
1022
+ providerOptions
1023
+ });
1024
+ }
1025
+ };
1026
+ var GeminiProvider = class {
1027
+ constructor(targetName, config) {
1028
+ this.config = config;
1029
+ this.id = `gemini:${targetName}`;
1030
+ this.targetName = targetName;
1031
+ this.defaults = {
1032
+ temperature: config.temperature,
1033
+ maxOutputTokens: config.maxOutputTokens
1034
+ };
1035
+ this.retryConfig = config.retry;
1036
+ const google = createGoogleGenerativeAI({
1037
+ apiKey: config.apiKey
1038
+ });
1039
+ this.model = google(config.model);
1040
+ }
1041
+ id;
1042
+ kind = "gemini";
1043
+ targetName;
1044
+ model;
1045
+ defaults;
1046
+ retryConfig;
1047
+ async invoke(request) {
1048
+ return invokeModel({
1049
+ model: this.model,
1050
+ request,
1051
+ defaults: this.defaults,
1052
+ retryConfig: this.retryConfig
1053
+ });
1054
+ }
1055
+ };
1056
+ function buildAzureOptions(config) {
1057
+ const options = {
1058
+ apiKey: config.apiKey,
1059
+ apiVersion: config.version,
1060
+ useDeploymentBasedUrls: true
1061
+ };
1062
+ const baseURL = normalizeAzureBaseUrl(config.resourceName);
1063
+ if (baseURL) {
1064
+ options.baseURL = baseURL;
1065
+ } else {
1066
+ options.resourceName = config.resourceName;
1067
+ }
1068
+ return options;
1069
+ }
1070
+ function normalizeAzureBaseUrl(resourceName) {
1071
+ const trimmed = resourceName.trim();
1072
+ if (!/^https?:\/\//i.test(trimmed)) {
1073
+ return void 0;
1074
+ }
1075
+ const withoutSlash = trimmed.replace(/\/+$/, "");
1076
+ const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
1077
+ return normalized;
1078
+ }
1079
+ function buildAnthropicProviderOptions(defaults) {
1080
+ if (defaults.thinkingBudget === void 0) {
1081
+ return void 0;
1082
+ }
1083
+ return {
1084
+ anthropic: {
1085
+ thinking: {
1086
+ type: "enabled",
1087
+ budgetTokens: defaults.thinkingBudget
1088
+ }
1089
+ }
1090
+ };
1091
+ }
816
1092
  function buildChatPrompt(request) {
817
- if (request.chatPrompt) {
818
- const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
1093
+ const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
1094
+ if (provided) {
1095
+ const hasSystemMessage = provided.some((message) => message.role === "system");
819
1096
  if (hasSystemMessage) {
820
- return request.chatPrompt;
1097
+ return provided;
821
1098
  }
822
- const systemContent2 = resolveSystemContent(request);
823
- return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
1099
+ const systemContent2 = resolveSystemContent(request, false);
1100
+ return [{ role: "system", content: systemContent2 }, ...provided];
824
1101
  }
825
- const systemContent = resolveSystemContent(request);
1102
+ const systemContent = resolveSystemContent(request, true);
826
1103
  const userContent = request.question.trim();
827
1104
  const prompt = [
828
- {
829
- role: "system",
830
- content: systemContent
831
- },
832
- {
833
- role: "user",
834
- content: userContent
835
- }
1105
+ { role: "system", content: systemContent },
1106
+ { role: "user", content: userContent }
836
1107
  ];
837
1108
  return prompt;
838
1109
  }
839
- function resolveSystemContent(request) {
1110
+ function resolveSystemContent(request, includeGuidelines) {
840
1111
  const systemSegments = [];
841
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
842
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
843
- systemSegments.push(metadataSystemPrompt.trim());
1112
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
1113
+ systemSegments.push(request.systemPrompt.trim());
844
1114
  } else {
845
1115
  systemSegments.push(DEFAULT_SYSTEM_PROMPT);
846
1116
  }
847
- if (request.guidelines && request.guidelines.trim().length > 0) {
1117
+ if (includeGuidelines && request.guidelines && request.guidelines.trim().length > 0) {
848
1118
  systemSegments.push(`[[ ## Guidelines ## ]]
849
1119
 
850
1120
  ${request.guidelines.trim()}`);
851
1121
  }
852
1122
  return systemSegments.join("\n\n");
853
1123
  }
854
- function extractModelConfig(request, defaults) {
1124
+ function toModelMessages(chatPrompt) {
1125
+ return chatPrompt.map((message) => {
1126
+ if (message.role === "tool" || message.role === "function") {
1127
+ const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
1128
+ return {
1129
+ role: "assistant",
1130
+ content: `${prefix}${message.content}`
1131
+ };
1132
+ }
1133
+ if (message.role === "assistant" || message.role === "system" || message.role === "user") {
1134
+ return {
1135
+ role: message.role,
1136
+ content: message.content
1137
+ };
1138
+ }
1139
+ return {
1140
+ role: "user",
1141
+ content: message.content
1142
+ };
1143
+ });
1144
+ }
1145
+ function resolveModelSettings(request, defaults) {
855
1146
  const temperature = request.temperature ?? defaults.temperature;
856
- const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
857
- const config = {};
858
- if (temperature !== void 0) {
859
- config.temperature = temperature;
860
- }
861
- if (maxTokens !== void 0) {
862
- config.maxTokens = maxTokens;
863
- }
864
- return Object.keys(config).length > 0 ? config : void 0;
1147
+ const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
1148
+ return {
1149
+ temperature,
1150
+ maxOutputTokens
1151
+ };
1152
+ }
1153
+ async function invokeModel(options) {
1154
+ const { model, request, defaults, retryConfig, providerOptions } = options;
1155
+ const chatPrompt = buildChatPrompt(request);
1156
+ const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
1157
+ const result = await withRetry(
1158
+ () => generateText({
1159
+ model,
1160
+ messages: toModelMessages(chatPrompt),
1161
+ temperature,
1162
+ maxOutputTokens,
1163
+ maxRetries: 0,
1164
+ abortSignal: request.signal,
1165
+ ...providerOptions ? { providerOptions } : {}
1166
+ }),
1167
+ retryConfig,
1168
+ request.signal
1169
+ );
1170
+ return mapResponse(result);
865
1171
  }
866
- function mapResponse(response) {
867
- const primary = response.results[0];
868
- const text = typeof primary?.content === "string" ? primary.content : "";
869
- const reasoning = primary?.thought ?? primary?.thoughtBlock?.data;
870
- const usage = toJsonObject(response.modelUsage);
1172
+ function mapResponse(result) {
871
1173
  return {
872
- text,
873
- reasoning,
874
- raw: response,
875
- usage
1174
+ text: result.text ?? "",
1175
+ reasoning: result.reasoningText ?? void 0,
1176
+ raw: result,
1177
+ usage: toJsonObject(result.totalUsage ?? result.usage)
876
1178
  };
877
1179
  }
878
1180
  function toJsonObject(value) {
@@ -885,34 +1187,59 @@ function toJsonObject(value) {
885
1187
  return void 0;
886
1188
  }
887
1189
  }
888
- function ensureChatResponse(result) {
889
- if (typeof ReadableStream !== "undefined" && result instanceof ReadableStream) {
890
- throw new Error("Streaming responses are not supported for this provider");
1190
+ function extractStatus(error) {
1191
+ if (!error || typeof error !== "object") {
1192
+ return void 0;
891
1193
  }
892
- if (!result || typeof result !== "object" || !("results" in result)) {
893
- throw new Error("Unexpected response type from AxAI provider");
1194
+ const candidate = error;
1195
+ const directStatus = candidate.status ?? candidate.statusCode;
1196
+ if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
1197
+ return directStatus;
894
1198
  }
895
- return result;
1199
+ const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
1200
+ if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
1201
+ return responseStatus;
1202
+ }
1203
+ const message = typeof candidate.message === "string" ? candidate.message : void 0;
1204
+ if (message) {
1205
+ const match = message.match(/HTTP\s+(\d{3})/i);
1206
+ if (match) {
1207
+ const parsed = Number.parseInt(match[1], 10);
1208
+ if (Number.isFinite(parsed)) {
1209
+ return parsed;
1210
+ }
1211
+ }
1212
+ }
1213
+ return void 0;
896
1214
  }
897
- function isRetryableError(error, retryableStatusCodes) {
1215
+ function isNetworkError(error) {
898
1216
  if (!error || typeof error !== "object") {
899
1217
  return false;
900
1218
  }
901
- if ("status" in error && typeof error.status === "number") {
902
- return retryableStatusCodes.includes(error.status);
1219
+ const candidate = error;
1220
+ if (candidate.name === "AbortError") {
1221
+ return false;
903
1222
  }
904
- if ("message" in error && typeof error.message === "string") {
905
- const match = error.message.match(/HTTP (\d{3})/);
906
- if (match) {
907
- const status = Number.parseInt(match[1], 10);
908
- return retryableStatusCodes.includes(status);
909
- }
1223
+ const code = candidate.code;
1224
+ if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
1225
+ return true;
910
1226
  }
911
- if ("name" in error && error.name === "AxAIServiceNetworkError") {
1227
+ const message = typeof candidate.message === "string" ? candidate.message : void 0;
1228
+ if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
912
1229
  return true;
913
1230
  }
914
1231
  return false;
915
1232
  }
1233
+ function isRetryableError(error, retryableStatusCodes) {
1234
+ const status = extractStatus(error);
1235
+ if (status === 401 || status === 403) {
1236
+ return false;
1237
+ }
1238
+ if (typeof status === "number") {
1239
+ return retryableStatusCodes.includes(status);
1240
+ }
1241
+ return isNetworkError(error);
1242
+ }
916
1243
  function calculateRetryDelay(attempt, config) {
917
1244
  const delay = Math.min(
918
1245
  config.maxDelayMs,
@@ -948,152 +1275,16 @@ async function withRetry(fn, retryConfig, signal) {
948
1275
  }
949
1276
  const delay = calculateRetryDelay(attempt, config);
950
1277
  await sleep(delay);
951
- if (signal?.aborted) {
952
- throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
953
- }
954
1278
  }
955
1279
  }
956
1280
  throw lastError;
957
1281
  }
958
- var AzureProvider = class {
959
- constructor(targetName, config) {
960
- this.config = config;
961
- this.id = `azure:${targetName}`;
962
- this.targetName = targetName;
963
- this.defaults = {
964
- temperature: config.temperature,
965
- maxOutputTokens: config.maxOutputTokens
966
- };
967
- this.retryConfig = config.retry;
968
- this.ai = AxAI.create({
969
- name: "azure-openai",
970
- apiKey: config.apiKey,
971
- resourceName: config.resourceName,
972
- deploymentName: config.deploymentName,
973
- version: config.version,
974
- config: {
975
- stream: false
976
- }
977
- });
978
- }
979
- id;
980
- kind = "azure";
981
- targetName;
982
- ai;
983
- defaults;
984
- retryConfig;
985
- async invoke(request) {
986
- const chatPrompt = buildChatPrompt(request);
987
- const modelConfig = extractModelConfig(request, this.defaults);
988
- const response = await withRetry(
989
- async () => await this.ai.chat(
990
- {
991
- chatPrompt,
992
- model: this.config.deploymentName,
993
- ...modelConfig ? { modelConfig } : {}
994
- },
995
- request.signal ? { abortSignal: request.signal } : void 0
996
- ),
997
- this.retryConfig,
998
- request.signal
999
- );
1000
- return mapResponse(ensureChatResponse(response));
1001
- }
1002
- getAxAI() {
1003
- return this.ai;
1004
- }
1005
- };
1006
- var AnthropicProvider = class {
1007
- constructor(targetName, config) {
1008
- this.config = config;
1009
- this.id = `anthropic:${targetName}`;
1010
- this.targetName = targetName;
1011
- this.defaults = {
1012
- temperature: config.temperature,
1013
- maxOutputTokens: config.maxOutputTokens,
1014
- thinkingBudget: config.thinkingBudget
1015
- };
1016
- this.retryConfig = config.retry;
1017
- this.ai = AxAI.create({
1018
- name: "anthropic",
1019
- apiKey: config.apiKey
1020
- });
1021
- }
1022
- id;
1023
- kind = "anthropic";
1024
- targetName;
1025
- ai;
1026
- defaults;
1027
- retryConfig;
1028
- async invoke(request) {
1029
- const chatPrompt = buildChatPrompt(request);
1030
- const modelConfig = extractModelConfig(request, this.defaults);
1031
- const response = await withRetry(
1032
- async () => await this.ai.chat(
1033
- {
1034
- chatPrompt,
1035
- model: this.config.model,
1036
- ...modelConfig ? { modelConfig } : {}
1037
- },
1038
- request.signal ? { abortSignal: request.signal } : void 0
1039
- ),
1040
- this.retryConfig,
1041
- request.signal
1042
- );
1043
- return mapResponse(ensureChatResponse(response));
1044
- }
1045
- getAxAI() {
1046
- return this.ai;
1047
- }
1048
- };
1049
- var GeminiProvider = class {
1050
- constructor(targetName, config) {
1051
- this.config = config;
1052
- this.id = `gemini:${targetName}`;
1053
- this.targetName = targetName;
1054
- this.defaults = {
1055
- temperature: config.temperature,
1056
- maxOutputTokens: config.maxOutputTokens
1057
- };
1058
- this.retryConfig = config.retry;
1059
- this.ai = AxAI.create({
1060
- name: "google-gemini",
1061
- apiKey: config.apiKey
1062
- });
1063
- }
1064
- id;
1065
- kind = "gemini";
1066
- targetName;
1067
- ai;
1068
- defaults;
1069
- retryConfig;
1070
- async invoke(request) {
1071
- const chatPrompt = buildChatPrompt(request);
1072
- const modelConfig = extractModelConfig(request, this.defaults);
1073
- const response = await withRetry(
1074
- async () => await this.ai.chat(
1075
- {
1076
- chatPrompt,
1077
- model: this.config.model,
1078
- ...modelConfig ? { modelConfig } : {}
1079
- },
1080
- request.signal ? { abortSignal: request.signal } : void 0
1081
- ),
1082
- this.retryConfig,
1083
- request.signal
1084
- );
1085
- return mapResponse(ensureChatResponse(response));
1086
- }
1087
- getAxAI() {
1088
- return this.ai;
1089
- }
1090
- };
1091
1282
 
1092
1283
  // src/evaluation/providers/cli.ts
1093
1284
  import { exec as execWithCallback } from "node:child_process";
1094
1285
  import fs from "node:fs/promises";
1095
1286
  import os from "node:os";
1096
- import path2 from "node:path";
1287
+ import path7 from "node:path";
1097
1288
  import { promisify } from "node:util";
1098
1289
  var execAsync = promisify(execWithCallback);
1099
1290
  var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
@@ -1135,12 +1326,14 @@ var CliProvider = class {
1135
1326
  supportsBatch = false;
1136
1327
  config;
1137
1328
  runCommand;
1329
+ verbose;
1138
1330
  healthcheckPromise;
1139
1331
  constructor(targetName, config, runner = defaultCommandRunner) {
1140
1332
  this.targetName = targetName;
1141
1333
  this.id = `cli:${targetName}`;
1142
1334
  this.config = config;
1143
1335
  this.runCommand = runner;
1336
+ this.verbose = config.verbose ?? false;
1144
1337
  }
1145
1338
  async invoke(request) {
1146
1339
  if (request.signal?.aborted) {
@@ -1241,6 +1434,11 @@ var CliProvider = class {
1241
1434
  generateOutputFilePath("healthcheck")
1242
1435
  )
1243
1436
  );
1437
+ if (this.verbose) {
1438
+ console.log(
1439
+ `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
1440
+ );
1441
+ }
1244
1442
  const result = await this.runCommand(renderedCommand, {
1245
1443
  cwd: healthcheck.cwd ?? this.config.cwd,
1246
1444
  env: process.env,
@@ -1272,7 +1470,7 @@ function normalizeInputFiles(inputFiles) {
1272
1470
  }
1273
1471
  const unique = /* @__PURE__ */ new Map();
1274
1472
  for (const inputFile of inputFiles) {
1275
- const absolutePath = path2.resolve(inputFile);
1473
+ const absolutePath = path7.resolve(inputFile);
1276
1474
  if (!unique.has(absolutePath)) {
1277
1475
  unique.set(absolutePath, absolutePath);
1278
1476
  }
@@ -1286,7 +1484,7 @@ function formatFileList(files, template) {
1286
1484
  const formatter = template ?? "{path}";
1287
1485
  return files.map((filePath) => {
1288
1486
  const escapedPath = shellEscape(filePath);
1289
- const escapedName = shellEscape(path2.basename(filePath));
1487
+ const escapedName = shellEscape(path7.basename(filePath));
1290
1488
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1291
1489
  }).join(" ");
1292
1490
  }
@@ -1310,7 +1508,7 @@ function generateOutputFilePath(evalCaseId) {
1310
1508
  const safeEvalId = evalCaseId || "unknown";
1311
1509
  const timestamp = Date.now();
1312
1510
  const random = Math.random().toString(36).substring(2, 9);
1313
- return path2.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1511
+ return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1314
1512
  }
1315
1513
  function formatTimeoutSuffix(timeoutMs) {
1316
1514
  if (!timeoutMs || timeoutMs <= 0) {
@@ -1326,7 +1524,7 @@ import { randomUUID } from "node:crypto";
1326
1524
  import { constants as constants2, createWriteStream } from "node:fs";
1327
1525
  import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
1328
1526
  import { tmpdir } from "node:os";
1329
- import path4 from "node:path";
1527
+ import path9 from "node:path";
1330
1528
  import { promisify as promisify2 } from "node:util";
1331
1529
 
1332
1530
  // src/evaluation/providers/codex-log-tracker.ts
@@ -1383,7 +1581,7 @@ function subscribeToCodexLogEntries(listener) {
1383
1581
  }
1384
1582
 
1385
1583
  // src/evaluation/providers/preread.ts
1386
- import path3 from "node:path";
1584
+ import path8 from "node:path";
1387
1585
  function buildPromptDocument(request, inputFiles, options) {
1388
1586
  const parts = [];
1389
1587
  const guidelineFiles = collectGuidelineFiles(
@@ -1408,7 +1606,7 @@ function normalizeInputFiles2(inputFiles) {
1408
1606
  }
1409
1607
  const deduped = /* @__PURE__ */ new Map();
1410
1608
  for (const inputFile of inputFiles) {
1411
- const absolutePath = path3.resolve(inputFile);
1609
+ const absolutePath = path8.resolve(inputFile);
1412
1610
  if (!deduped.has(absolutePath)) {
1413
1611
  deduped.set(absolutePath, absolutePath);
1414
1612
  }
@@ -1421,14 +1619,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
1421
1619
  }
1422
1620
  const unique = /* @__PURE__ */ new Map();
1423
1621
  for (const inputFile of inputFiles) {
1424
- const absolutePath = path3.resolve(inputFile);
1622
+ const absolutePath = path8.resolve(inputFile);
1425
1623
  if (overrides?.has(absolutePath)) {
1426
1624
  if (!unique.has(absolutePath)) {
1427
1625
  unique.set(absolutePath, absolutePath);
1428
1626
  }
1429
1627
  continue;
1430
1628
  }
1431
- const normalized = absolutePath.split(path3.sep).join("/");
1629
+ const normalized = absolutePath.split(path8.sep).join("/");
1432
1630
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1433
1631
  if (!unique.has(absolutePath)) {
1434
1632
  unique.set(absolutePath, absolutePath);
@@ -1443,7 +1641,7 @@ function collectInputFiles(inputFiles) {
1443
1641
  }
1444
1642
  const unique = /* @__PURE__ */ new Map();
1445
1643
  for (const inputFile of inputFiles) {
1446
- const absolutePath = path3.resolve(inputFile);
1644
+ const absolutePath = path8.resolve(inputFile);
1447
1645
  if (!unique.has(absolutePath)) {
1448
1646
  unique.set(absolutePath, absolutePath);
1449
1647
  }
@@ -1455,7 +1653,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
1455
1653
  return "";
1456
1654
  }
1457
1655
  const buildList = (files) => files.map((absolutePath) => {
1458
- const fileName = path3.basename(absolutePath);
1656
+ const fileName = path8.basename(absolutePath);
1459
1657
  const fileUri = pathToFileUri(absolutePath);
1460
1658
  return `* [${fileName}](${fileUri})`;
1461
1659
  });
@@ -1475,7 +1673,7 @@ ${buildList(inputFiles).join("\n")}.`);
1475
1673
  return sections.join("\n");
1476
1674
  }
1477
1675
  function pathToFileUri(filePath) {
1478
- const absolutePath = path3.isAbsolute(filePath) ? filePath : path3.resolve(filePath);
1676
+ const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
1479
1677
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1480
1678
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1481
1679
  return `file:///${normalizedPath}`;
@@ -1513,7 +1711,7 @@ var CodexProvider = class {
1513
1711
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1514
1712
  try {
1515
1713
  const promptContent = buildPromptDocument(request, inputFiles);
1516
- const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
1714
+ const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
1517
1715
  await writeFile(promptFile, promptContent, "utf8");
1518
1716
  const args = this.buildCodexArgs();
1519
1717
  const cwd = this.resolveCwd(workspaceRoot);
@@ -1563,7 +1761,7 @@ var CodexProvider = class {
1563
1761
  if (!this.config.cwd) {
1564
1762
  return workspaceRoot;
1565
1763
  }
1566
- return path4.resolve(this.config.cwd);
1764
+ return path9.resolve(this.config.cwd);
1567
1765
  }
1568
1766
  buildCodexArgs() {
1569
1767
  const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
@@ -1597,7 +1795,7 @@ var CodexProvider = class {
1597
1795
  }
1598
1796
  }
1599
1797
  async createWorkspace() {
1600
- return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
1798
+ return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
1601
1799
  }
1602
1800
  async cleanupWorkspace(workspaceRoot) {
1603
1801
  try {
@@ -1611,9 +1809,9 @@ var CodexProvider = class {
1611
1809
  return void 0;
1612
1810
  }
1613
1811
  if (this.config.logDir) {
1614
- return path4.resolve(this.config.logDir);
1812
+ return path9.resolve(this.config.logDir);
1615
1813
  }
1616
- return path4.join(process.cwd(), ".agentv", "logs", "codex");
1814
+ return path9.join(process.cwd(), ".agentv", "logs", "codex");
1617
1815
  }
1618
1816
  async createStreamLogger(request) {
1619
1817
  const logDir = this.resolveLogDirectory();
@@ -1627,7 +1825,7 @@ var CodexProvider = class {
1627
1825
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
1628
1826
  return void 0;
1629
1827
  }
1630
- const filePath = path4.join(logDir, buildLogFilename(request, this.targetName));
1828
+ const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
1631
1829
  try {
1632
1830
  const logger = await CodexStreamLogger.create({
1633
1831
  filePath,
@@ -1842,7 +2040,7 @@ function tryParseJsonValue(rawLine) {
1842
2040
  async function locateExecutable(candidate) {
1843
2041
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
1844
2042
  if (includesPathSeparator) {
1845
- const resolved = path4.isAbsolute(candidate) ? candidate : path4.resolve(candidate);
2043
+ const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
1846
2044
  const executablePath = await ensureWindowsExecutableVariant(resolved);
1847
2045
  await access2(executablePath, constants2.F_OK);
1848
2046
  return executablePath;
@@ -2189,7 +2387,7 @@ var MockProvider = class {
2189
2387
  };
2190
2388
 
2191
2389
  // src/evaluation/providers/vscode.ts
2192
- import path5 from "node:path";
2390
+ import path10 from "node:path";
2193
2391
  import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
2194
2392
  var VSCodeProvider = class {
2195
2393
  id;
@@ -2302,6 +2500,9 @@ var VSCodeProvider = class {
2302
2500
  };
2303
2501
  function buildPromptDocument2(request, attachments, guidelinePatterns) {
2304
2502
  const parts = [];
2503
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
2504
+ parts.push(request.systemPrompt.trim());
2505
+ }
2305
2506
  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
2306
2507
  const attachmentFiles = collectAttachmentFiles(attachments);
2307
2508
  const nonGuidelineAttachments = attachmentFiles.filter(
@@ -2319,7 +2520,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
2319
2520
  return "";
2320
2521
  }
2321
2522
  const buildList = (files) => files.map((absolutePath) => {
2322
- const fileName = path5.basename(absolutePath);
2523
+ const fileName = path10.basename(absolutePath);
2323
2524
  const fileUri = pathToFileUri2(absolutePath);
2324
2525
  return `* [${fileName}](${fileUri})`;
2325
2526
  });
@@ -2344,8 +2545,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
2344
2545
  }
2345
2546
  const unique = /* @__PURE__ */ new Map();
2346
2547
  for (const attachment of attachments) {
2347
- const absolutePath = path5.resolve(attachment);
2348
- const normalized = absolutePath.split(path5.sep).join("/");
2548
+ const absolutePath = path10.resolve(attachment);
2549
+ const normalized = absolutePath.split(path10.sep).join("/");
2349
2550
  if (isGuidelineFile(normalized, guidelinePatterns)) {
2350
2551
  if (!unique.has(absolutePath)) {
2351
2552
  unique.set(absolutePath, absolutePath);
@@ -2360,7 +2561,7 @@ function collectAttachmentFiles(attachments) {
2360
2561
  }
2361
2562
  const unique = /* @__PURE__ */ new Map();
2362
2563
  for (const attachment of attachments) {
2363
- const absolutePath = path5.resolve(attachment);
2564
+ const absolutePath = path10.resolve(attachment);
2364
2565
  if (!unique.has(absolutePath)) {
2365
2566
  unique.set(absolutePath, absolutePath);
2366
2567
  }
@@ -2368,7 +2569,7 @@ function collectAttachmentFiles(attachments) {
2368
2569
  return Array.from(unique.values());
2369
2570
  }
2370
2571
  function pathToFileUri2(filePath) {
2371
- const absolutePath = path5.isAbsolute(filePath) ? filePath : path5.resolve(filePath);
2572
+ const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
2372
2573
  const normalizedPath = absolutePath.replace(/\\/g, "/");
2373
2574
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
2374
2575
  return `file:///${normalizedPath}`;
@@ -2381,7 +2582,7 @@ function normalizeAttachments(attachments) {
2381
2582
  }
2382
2583
  const deduped = /* @__PURE__ */ new Set();
2383
2584
  for (const attachment of attachments) {
2384
- deduped.add(path5.resolve(attachment));
2585
+ deduped.add(path10.resolve(attachment));
2385
2586
  }
2386
2587
  return Array.from(deduped);
2387
2588
  }
@@ -2390,7 +2591,7 @@ function mergeAttachments(all) {
2390
2591
  for (const list of all) {
2391
2592
  if (!list) continue;
2392
2593
  for (const inputFile of list) {
2393
- deduped.add(path5.resolve(inputFile));
2594
+ deduped.add(path10.resolve(inputFile));
2394
2595
  }
2395
2596
  }
2396
2597
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -2436,9 +2637,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2436
2637
 
2437
2638
  // src/evaluation/providers/targets-file.ts
2438
2639
  import { constants as constants3 } from "node:fs";
2439
- import { access as access3, readFile as readFile2 } from "node:fs/promises";
2440
- import path6 from "node:path";
2441
- import { parse as parse2 } from "yaml";
2640
+ import { access as access3, readFile as readFile5 } from "node:fs/promises";
2641
+ import path11 from "node:path";
2642
+ import { parse as parse3 } from "yaml";
2442
2643
  function isRecord(value) {
2443
2644
  return typeof value === "object" && value !== null && !Array.isArray(value);
2444
2645
  }
@@ -2493,12 +2694,12 @@ async function fileExists3(filePath) {
2493
2694
  }
2494
2695
  }
2495
2696
  async function readTargetDefinitions(filePath) {
2496
- const absolutePath = path6.resolve(filePath);
2697
+ const absolutePath = path11.resolve(filePath);
2497
2698
  if (!await fileExists3(absolutePath)) {
2498
2699
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2499
2700
  }
2500
- const raw = await readFile2(absolutePath, "utf8");
2501
- const parsed = parse2(raw);
2701
+ const raw = await readFile5(absolutePath, "utf8");
2702
+ const parsed = parse3(raw);
2502
2703
  if (!isRecord(parsed)) {
2503
2704
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
2504
2705
  }
@@ -2541,18 +2742,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
2541
2742
  }
2542
2743
 
2543
2744
  // src/evaluation/evaluators.ts
2544
- import { randomUUID as randomUUID2 } from "node:crypto";
2745
+ var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
2746
+
2747
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
2748
+
2749
+ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
2750
+
2751
+ [[ ## expected_outcome ## ]]
2752
+ {{expected_outcome}}
2753
+
2754
+ [[ ## question ## ]]
2755
+ {{question}}
2756
+
2757
+ [[ ## reference_answer ## ]]
2758
+ {{reference_answer}}
2759
+
2760
+ [[ ## candidate_answer ## ]]
2761
+ {{candidate_answer}}`;
2545
2762
  var LlmJudgeEvaluator = class {
2546
2763
  kind = "llm_judge";
2547
2764
  resolveJudgeProvider;
2548
2765
  maxOutputTokens;
2549
2766
  temperature;
2550
- customPrompt;
2767
+ evaluatorTemplate;
2551
2768
  constructor(options) {
2552
2769
  this.resolveJudgeProvider = options.resolveJudgeProvider;
2553
2770
  this.maxOutputTokens = options.maxOutputTokens;
2554
2771
  this.temperature = options.temperature;
2555
- this.customPrompt = options.customPrompt;
2772
+ this.evaluatorTemplate = options.evaluatorTemplate;
2556
2773
  }
2557
2774
  async evaluate(context) {
2558
2775
  const judgeProvider = await this.resolveJudgeProvider(context);
@@ -2562,26 +2779,21 @@ var LlmJudgeEvaluator = class {
2562
2779
  return this.evaluateWithPrompt(context, judgeProvider);
2563
2780
  }
2564
2781
  async evaluateWithPrompt(context, judgeProvider) {
2565
- const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
2566
2782
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
2567
- let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
2568
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
2569
- if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2570
- const variables = {
2571
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2572
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2573
- candidate_answer: context.candidate,
2574
- reference_answer: context.evalCase.reference_answer ?? "",
2575
- expected_outcome: context.evalCase.expected_outcome,
2576
- question: formattedQuestion
2577
- };
2578
- prompt = substituteVariables(systemPrompt, variables);
2579
- systemPrompt = buildSystemPrompt(hasReferenceAnswer);
2580
- }
2581
- const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
2783
+ const variables = {
2784
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2785
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2786
+ candidate_answer: context.candidate.trim(),
2787
+ reference_answer: (context.evalCase.reference_answer ?? "").trim(),
2788
+ expected_outcome: context.evalCase.expected_outcome.trim(),
2789
+ question: formattedQuestion.trim()
2790
+ };
2791
+ const systemPrompt = buildOutputSchema();
2792
+ const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
2793
+ const userPrompt = substituteVariables(evaluatorTemplate, variables);
2582
2794
  const response = await judgeProvider.invoke({
2583
- question: prompt,
2584
- metadata,
2795
+ question: userPrompt,
2796
+ systemPrompt,
2585
2797
  evalCaseId: context.evalCase.id,
2586
2798
  attempt: context.attempt,
2587
2799
  maxOutputTokens: this.maxOutputTokens,
@@ -2594,11 +2806,9 @@ var LlmJudgeEvaluator = class {
2594
2806
  const reasoning = parsed.reasoning ?? response.reasoning;
2595
2807
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
2596
2808
  const evaluatorRawRequest = {
2597
- id: randomUUID2(),
2598
- provider: judgeProvider.id,
2599
- prompt,
2600
- target: context.target.name,
2601
- ...systemPrompt !== void 0 && { systemPrompt }
2809
+ userPrompt,
2810
+ systemPrompt,
2811
+ target: judgeProvider.targetName
2602
2812
  };
2603
2813
  return {
2604
2814
  score,
@@ -2610,20 +2820,8 @@ var LlmJudgeEvaluator = class {
2610
2820
  };
2611
2821
  }
2612
2822
  };
2613
- function buildSystemPrompt(hasReferenceAnswer) {
2614
- const basePrompt = [
2615
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2616
- ""
2617
- ];
2618
- if (hasReferenceAnswer) {
2619
- basePrompt.push(
2620
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
2621
- ""
2622
- );
2623
- }
2624
- basePrompt.push(
2625
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2626
- "",
2823
+ function buildOutputSchema() {
2824
+ return [
2627
2825
  "You must respond with a single JSON object matching this schema:",
2628
2826
  "",
2629
2827
  "{",
@@ -2632,30 +2830,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
2632
2830
  ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
2633
2831
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2634
2832
  "}"
2635
- );
2636
- return basePrompt.join("\n");
2637
- }
2638
- function buildQualityPrompt(evalCase, candidate, question) {
2639
- const parts = [
2640
- "[[ ## expected_outcome ## ]]",
2641
- evalCase.expected_outcome.trim(),
2642
- "",
2643
- "[[ ## question ## ]]",
2644
- question.trim(),
2645
- ""
2646
- ];
2647
- if (hasNonEmptyReferenceAnswer(evalCase)) {
2648
- parts.push(
2649
- "[[ ## reference_answer ## ]]",
2650
- evalCase.reference_answer.trim(),
2651
- ""
2652
- );
2653
- }
2654
- parts.push(
2655
- "[[ ## candidate_answer ## ]]",
2656
- candidate.trim()
2657
- );
2658
- return parts.join("\n");
2833
+ ].join("\n");
2659
2834
  }
2660
2835
  function clampScore(value) {
2661
2836
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -2737,9 +2912,6 @@ function extractJsonBlob(text) {
2737
2912
  function isNonEmptyString(value) {
2738
2913
  return typeof value === "string" && value.trim().length > 0;
2739
2914
  }
2740
- function hasNonEmptyReferenceAnswer(evalCase) {
2741
- return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
2742
- }
2743
2915
  var CodeEvaluator = class {
2744
2916
  kind = "code";
2745
2917
  script;
@@ -2845,19 +3017,16 @@ function parseJsonSafe(payload) {
2845
3017
  return void 0;
2846
3018
  }
2847
3019
  }
2848
- function hasTemplateVariables(text) {
2849
- return /\$\{[a-zA-Z0-9_]+\}/.test(text);
2850
- }
2851
3020
  function substituteVariables(template, variables) {
2852
- return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
3021
+ return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
2853
3022
  return variables[varName] ?? match;
2854
3023
  });
2855
3024
  }
2856
3025
 
2857
3026
  // src/evaluation/orchestrator.ts
2858
- import { createHash, randomUUID as randomUUID3 } from "node:crypto";
3027
+ import { createHash, randomUUID as randomUUID2 } from "node:crypto";
2859
3028
  import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
2860
- import path7 from "node:path";
3029
+ import path12 from "node:path";
2861
3030
 
2862
3031
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
2863
3032
  var Node = class {
@@ -3420,6 +3589,7 @@ async function evaluateCandidate(options) {
3420
3589
  }
3421
3590
  }
3422
3591
  return {
3592
+ timestamp: completedAt.toISOString(),
3423
3593
  eval_id: evalCase.id,
3424
3594
  dataset: evalCase.dataset,
3425
3595
  conversation_id: evalCase.conversation_id,
@@ -3427,14 +3597,12 @@ async function evaluateCandidate(options) {
3427
3597
  hits: score.hits,
3428
3598
  misses: score.misses,
3429
3599
  candidate_answer: candidate,
3430
- expected_aspect_count: score.expectedAspectCount,
3431
3600
  target: target.name,
3432
- timestamp: completedAt.toISOString(),
3433
3601
  reasoning: score.reasoning,
3434
3602
  raw_aspects: score.rawAspects,
3435
3603
  agent_provider_request: agentProviderRequest,
3436
3604
  lm_provider_request: lmProviderRequest,
3437
- evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3605
+ evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3438
3606
  evaluator_results: evaluatorResults
3439
3607
  };
3440
3608
  }
@@ -3511,7 +3679,7 @@ async function runEvaluatorList(options) {
3511
3679
  hits: score2.hits,
3512
3680
  misses: score2.misses,
3513
3681
  reasoning: score2.reasoning,
3514
- evaluator_raw_request: score2.evaluatorRawRequest
3682
+ evaluator_provider_request: score2.evaluatorRawRequest
3515
3683
  });
3516
3684
  continue;
3517
3685
  }
@@ -3538,7 +3706,7 @@ async function runEvaluatorList(options) {
3538
3706
  hits: score2.hits,
3539
3707
  misses: score2.misses,
3540
3708
  reasoning: score2.reasoning,
3541
- evaluator_raw_request: score2.evaluatorRawRequest
3709
+ evaluator_provider_request: score2.evaluatorRawRequest
3542
3710
  });
3543
3711
  continue;
3544
3712
  }
@@ -3591,7 +3759,7 @@ async function runLlmJudgeEvaluator(options) {
3591
3759
  promptInputs,
3592
3760
  now,
3593
3761
  judgeProvider,
3594
- systemPrompt: customPrompt,
3762
+ evaluatorTemplateOverride: customPrompt,
3595
3763
  evaluator: config
3596
3764
  });
3597
3765
  }
@@ -3632,8 +3800,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
3632
3800
  async function dumpPrompt(directory, evalCase, promptInputs) {
3633
3801
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3634
3802
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
3635
- const filePath = path7.resolve(directory, filename);
3636
- await mkdir2(path7.dirname(filePath), { recursive: true });
3803
+ const filePath = path12.resolve(directory, filename);
3804
+ await mkdir2(path12.dirname(filePath), { recursive: true });
3637
3805
  const payload = {
3638
3806
  eval_id: evalCase.id,
3639
3807
  question: promptInputs.question,
@@ -3647,7 +3815,7 @@ function sanitizeFilename(value) {
3647
3815
  return "prompt";
3648
3816
  }
3649
3817
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
3650
- return sanitized.length > 0 ? sanitized : randomUUID3();
3818
+ return sanitized.length > 0 ? sanitized : randomUUID2();
3651
3819
  }
3652
3820
  async function invokeProvider(provider, options) {
3653
3821
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -3703,6 +3871,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3703
3871
  }
3704
3872
  }
3705
3873
  return {
3874
+ timestamp: timestamp.toISOString(),
3706
3875
  eval_id: evalCase.id,
3707
3876
  dataset: evalCase.dataset,
3708
3877
  conversation_id: evalCase.conversation_id,
@@ -3710,9 +3879,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3710
3879
  hits: [],
3711
3880
  misses: [`Error: ${message}`],
3712
3881
  candidate_answer: `Error occurred: ${message}`,
3713
- expected_aspect_count: 0,
3714
3882
  target: targetName,
3715
- timestamp: timestamp.toISOString(),
3716
3883
  raw_aspects: [],
3717
3884
  agent_provider_request: agentProviderRequest,
3718
3885
  lm_provider_request: lmProviderRequest,