@agentv/core 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-YQBJAT5I.js";
12
+ } from "./chunk-U3GEJ3K7.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -62,48 +62,197 @@ function getHitCount(result) {
62
62
  }
63
63
 
64
64
  // src/evaluation/yaml-parser.ts
65
+ import { readFile as readFile4 } from "node:fs/promises";
66
+ import path6 from "node:path";
67
+ import { parse as parse2 } from "yaml";
68
+
69
+ // src/evaluation/formatting/segment-formatter.ts
70
+ function extractCodeBlocks(segments) {
71
+ const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
72
+ const codeBlocks = [];
73
+ for (const segment of segments) {
74
+ const typeValue = segment["type"];
75
+ if (typeof typeValue !== "string" || typeValue !== "text") {
76
+ continue;
77
+ }
78
+ const textValue = segment["value"];
79
+ if (typeof textValue !== "string") {
80
+ continue;
81
+ }
82
+ const matches = textValue.match(CODE_BLOCK_PATTERN);
83
+ if (matches) {
84
+ codeBlocks.push(...matches);
85
+ }
86
+ }
87
+ return codeBlocks;
88
+ }
89
+ function formatFileContents(parts) {
90
+ const fileCount = parts.filter((p) => p.isFile).length;
91
+ if (fileCount > 0) {
92
+ return parts.map((part) => {
93
+ if (part.isFile && part.displayPath) {
94
+ return `<file path="${part.displayPath}">
95
+ ${part.content}
96
+ </file>`;
97
+ }
98
+ return part.content;
99
+ }).join("\n\n");
100
+ }
101
+ return parts.map((p) => p.content).join(" ");
102
+ }
103
+ function formatSegment(segment) {
104
+ const type = asString(segment.type);
105
+ if (type === "text") {
106
+ return asString(segment.value);
107
+ }
108
+ if (type === "guideline_ref") {
109
+ const refPath = asString(segment.path);
110
+ return refPath ? `<Attached: ${refPath}>` : void 0;
111
+ }
112
+ if (type === "file") {
113
+ const text = asString(segment.text);
114
+ const filePath = asString(segment.path);
115
+ if (text && filePath) {
116
+ return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
117
+ }
118
+ }
119
+ return void 0;
120
+ }
121
+ function hasVisibleContent(segments) {
122
+ return segments.some((segment) => {
123
+ const type = asString(segment.type);
124
+ if (type === "text") {
125
+ const value = asString(segment.value);
126
+ return value !== void 0 && value.trim().length > 0;
127
+ }
128
+ if (type === "guideline_ref") {
129
+ return false;
130
+ }
131
+ if (type === "file") {
132
+ const text = asString(segment.text);
133
+ return text !== void 0 && text.trim().length > 0;
134
+ }
135
+ return false;
136
+ });
137
+ }
138
+ function asString(value) {
139
+ return typeof value === "string" ? value : void 0;
140
+ }
141
+
142
+ // src/evaluation/loaders/config-loader.ts
65
143
  import micromatch from "micromatch";
144
+ import { readFile } from "node:fs/promises";
145
+ import path2 from "node:path";
146
+ import { parse } from "yaml";
147
+
148
+ // src/evaluation/loaders/file-resolver.ts
66
149
  import { constants } from "node:fs";
67
- import { access, readFile } from "node:fs/promises";
150
+ import { access } from "node:fs/promises";
68
151
  import path from "node:path";
69
- import { fileURLToPath } from "node:url";
70
- import { parse } from "yaml";
71
- var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
72
- var ANSI_YELLOW = "\x1B[33m";
73
- var ANSI_RESET = "\x1B[0m";
74
- var SCHEMA_EVAL_V2 = "agentv-eval-v2";
75
- var SCHEMA_CONFIG_V2 = "agentv-config-v2";
76
- async function readTestSuiteMetadata(testFilePath) {
152
+ async function fileExists2(absolutePath) {
77
153
  try {
78
- const absolutePath = path.resolve(testFilePath);
79
- const content = await readFile(absolutePath, "utf8");
80
- const parsed = parse(content);
81
- if (!isJsonObject(parsed)) {
82
- return {};
83
- }
84
- return { target: extractTargetFromSuite(parsed) };
154
+ await access(absolutePath, constants.F_OK);
155
+ return true;
85
156
  } catch {
86
- return {};
157
+ return false;
87
158
  }
88
159
  }
89
- function extractTargetFromSuite(suite) {
90
- const execution = suite.execution;
91
- if (execution && typeof execution === "object" && !Array.isArray(execution)) {
92
- const executionTarget = execution.target;
93
- if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
94
- return executionTarget.trim();
160
+ function resolveToAbsolutePath(candidate) {
161
+ if (candidate instanceof URL) {
162
+ return new URL(candidate).pathname;
163
+ }
164
+ if (typeof candidate === "string") {
165
+ if (candidate.startsWith("file://")) {
166
+ return new URL(candidate).pathname;
95
167
  }
168
+ return path.resolve(candidate);
96
169
  }
97
- const targetValue = suite.target;
98
- if (typeof targetValue === "string" && targetValue.trim().length > 0) {
99
- return targetValue.trim();
170
+ throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
171
+ }
172
+ function buildDirectoryChain2(filePath, repoRoot) {
173
+ const directories = [];
174
+ const seen = /* @__PURE__ */ new Set();
175
+ const boundary = path.resolve(repoRoot);
176
+ let current = path.resolve(path.dirname(filePath));
177
+ while (current !== void 0) {
178
+ if (!seen.has(current)) {
179
+ directories.push(current);
180
+ seen.add(current);
181
+ }
182
+ if (current === boundary) {
183
+ break;
184
+ }
185
+ const parent = path.dirname(current);
186
+ if (parent === current) {
187
+ break;
188
+ }
189
+ current = parent;
100
190
  }
101
- return void 0;
191
+ if (!seen.has(boundary)) {
192
+ directories.push(boundary);
193
+ }
194
+ return directories;
102
195
  }
196
+ function buildSearchRoots2(evalPath, repoRoot) {
197
+ const uniqueRoots = [];
198
+ const addRoot = (root) => {
199
+ const normalized = path.resolve(root);
200
+ if (!uniqueRoots.includes(normalized)) {
201
+ uniqueRoots.push(normalized);
202
+ }
203
+ };
204
+ let currentDir = path.dirname(evalPath);
205
+ let reachedBoundary = false;
206
+ while (!reachedBoundary) {
207
+ addRoot(currentDir);
208
+ const parentDir = path.dirname(currentDir);
209
+ if (currentDir === repoRoot || parentDir === currentDir) {
210
+ reachedBoundary = true;
211
+ } else {
212
+ currentDir = parentDir;
213
+ }
214
+ }
215
+ addRoot(repoRoot);
216
+ addRoot(process.cwd());
217
+ return uniqueRoots;
218
+ }
219
+ function trimLeadingSeparators(value) {
220
+ const trimmed = value.replace(/^[/\\]+/, "");
221
+ return trimmed.length > 0 ? trimmed : value;
222
+ }
223
+ async function resolveFileReference2(rawValue, searchRoots) {
224
+ const displayPath = trimLeadingSeparators(rawValue);
225
+ const potentialPaths = [];
226
+ if (path.isAbsolute(rawValue)) {
227
+ potentialPaths.push(path.normalize(rawValue));
228
+ }
229
+ for (const base of searchRoots) {
230
+ potentialPaths.push(path.resolve(base, displayPath));
231
+ }
232
+ const attempted = [];
233
+ const seen = /* @__PURE__ */ new Set();
234
+ for (const candidate of potentialPaths) {
235
+ const absoluteCandidate = path.resolve(candidate);
236
+ if (seen.has(absoluteCandidate)) {
237
+ continue;
238
+ }
239
+ seen.add(absoluteCandidate);
240
+ attempted.push(absoluteCandidate);
241
+ if (await fileExists2(absoluteCandidate)) {
242
+ return { displayPath, resolvedPath: absoluteCandidate, attempted };
243
+ }
244
+ }
245
+ return { displayPath, attempted };
246
+ }
247
+
248
+ // src/evaluation/loaders/config-loader.ts
249
+ var SCHEMA_CONFIG_V2 = "agentv-config-v2";
250
+ var ANSI_YELLOW = "\x1B[33m";
251
+ var ANSI_RESET = "\x1B[0m";
103
252
  async function loadConfig(evalFilePath, repoRoot) {
104
- const directories = buildDirectoryChain(evalFilePath, repoRoot);
253
+ const directories = buildDirectoryChain2(evalFilePath, repoRoot);
105
254
  for (const directory of directories) {
106
- const configPath = path.join(directory, ".agentv", "config.yaml");
255
+ const configPath = path2.join(directory, ".agentv", "config.yaml");
107
256
  if (!await fileExists2(configPath)) {
108
257
  continue;
109
258
  }
@@ -146,24 +295,134 @@ function isGuidelineFile(filePath, patterns) {
146
295
  const patternsToUse = patterns ?? [];
147
296
  return micromatch.isMatch(normalized, patternsToUse);
148
297
  }
149
- function extractCodeBlocks(segments) {
150
- const codeBlocks = [];
151
- for (const segment of segments) {
152
- const typeValue = segment["type"];
153
- if (typeof typeValue !== "string" || typeValue !== "text") {
298
+ function extractTargetFromSuite(suite) {
299
+ const execution = suite.execution;
300
+ if (execution && typeof execution === "object" && !Array.isArray(execution)) {
301
+ const executionTarget = execution.target;
302
+ if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
303
+ return executionTarget.trim();
304
+ }
305
+ }
306
+ const targetValue = suite.target;
307
+ if (typeof targetValue === "string" && targetValue.trim().length > 0) {
308
+ return targetValue.trim();
309
+ }
310
+ return void 0;
311
+ }
312
+ function logWarning(message) {
313
+ console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
314
+ }
315
+
316
+ // src/evaluation/loaders/evaluator-parser.ts
317
+ import path3 from "node:path";
318
+ var ANSI_YELLOW2 = "\x1B[33m";
319
+ var ANSI_RESET2 = "\x1B[0m";
320
+ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
321
+ const execution = rawEvalCase.execution;
322
+ const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
323
+ if (candidateEvaluators === void 0) {
324
+ return void 0;
325
+ }
326
+ if (!Array.isArray(candidateEvaluators)) {
327
+ logWarning2(`Skipping evaluators for '${evalId}': expected array`);
328
+ return void 0;
329
+ }
330
+ const evaluators = [];
331
+ for (const rawEvaluator of candidateEvaluators) {
332
+ if (!isJsonObject2(rawEvaluator)) {
333
+ logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
154
334
  continue;
155
335
  }
156
- const textValue = segment["value"];
157
- if (typeof textValue !== "string") {
336
+ const name = asString2(rawEvaluator.name);
337
+ const typeValue = rawEvaluator.type;
338
+ if (!name || !isEvaluatorKind(typeValue)) {
339
+ logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
158
340
  continue;
159
341
  }
160
- const matches = textValue.match(CODE_BLOCK_PATTERN);
161
- if (matches) {
162
- codeBlocks.push(...matches);
342
+ if (typeValue === "code") {
343
+ const script = asString2(rawEvaluator.script);
344
+ if (!script) {
345
+ logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
346
+ continue;
347
+ }
348
+ const cwd = asString2(rawEvaluator.cwd);
349
+ let resolvedCwd;
350
+ if (cwd) {
351
+ const resolved = await resolveFileReference2(cwd, searchRoots);
352
+ if (resolved.resolvedPath) {
353
+ resolvedCwd = path3.resolve(resolved.resolvedPath);
354
+ } else {
355
+ logWarning2(
356
+ `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
357
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
358
+ );
359
+ }
360
+ } else {
361
+ resolvedCwd = searchRoots[0];
362
+ }
363
+ evaluators.push({
364
+ name,
365
+ type: "code",
366
+ script,
367
+ cwd,
368
+ resolvedCwd
369
+ });
370
+ continue;
371
+ }
372
+ const prompt = asString2(rawEvaluator.prompt);
373
+ let promptPath;
374
+ if (prompt) {
375
+ const resolved = await resolveFileReference2(prompt, searchRoots);
376
+ if (resolved.resolvedPath) {
377
+ promptPath = path3.resolve(resolved.resolvedPath);
378
+ } else {
379
+ logWarning2(
380
+ `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
381
+ resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
382
+ );
383
+ }
163
384
  }
385
+ const _model = asString2(rawEvaluator.model);
386
+ evaluators.push({
387
+ name,
388
+ type: "llm_judge",
389
+ prompt,
390
+ promptPath
391
+ });
164
392
  }
165
- return codeBlocks;
393
+ return evaluators.length > 0 ? evaluators : void 0;
166
394
  }
395
+ function coerceEvaluator(candidate, contextId) {
396
+ if (typeof candidate !== "string") {
397
+ return void 0;
398
+ }
399
+ if (isEvaluatorKind(candidate)) {
400
+ return candidate;
401
+ }
402
+ logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
403
+ return void 0;
404
+ }
405
+ function asString2(value) {
406
+ return typeof value === "string" ? value : void 0;
407
+ }
408
+ function isJsonObject2(value) {
409
+ return typeof value === "object" && value !== null && !Array.isArray(value);
410
+ }
411
+ function logWarning2(message, details) {
412
+ if (details && details.length > 0) {
413
+ const detailBlock = details.join("\n");
414
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}
415
+ ${detailBlock}${ANSI_RESET2}`);
416
+ } else {
417
+ console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
418
+ }
419
+ }
420
+
421
+ // src/evaluation/loaders/message-processor.ts
422
+ import { readFile as readFile2 } from "node:fs/promises";
423
+ import path4 from "node:path";
424
+ var ANSI_YELLOW3 = "\x1B[33m";
425
+ var ANSI_RESET3 = "\x1B[0m";
167
426
  async function processMessages(options) {
168
427
  const {
169
428
  messages,
@@ -189,257 +448,173 @@ async function processMessages(options) {
189
448
  if (!isJsonObject(rawSegment)) {
190
449
  continue;
191
450
  }
192
- const segmentType = asString(rawSegment.type);
451
+ const segmentType = asString3(rawSegment.type);
193
452
  if (segmentType === "file") {
194
- const rawValue = asString(rawSegment.value);
453
+ const rawValue = asString3(rawSegment.value);
195
454
  if (!rawValue) {
196
455
  continue;
197
456
  }
198
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
457
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
199
458
  rawValue,
200
- searchRoots
201
- );
202
- if (!resolvedPath) {
203
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
204
- const context = messageType === "input" ? "" : " in expected_messages";
205
- logWarning(`File not found${context}: ${displayPath}`, attempts);
206
- continue;
207
- }
208
- try {
209
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
210
- if (messageType === "input" && guidelinePatterns && guidelinePaths) {
211
- const relativeToRepo = path.relative(repoRootPath, resolvedPath);
212
- if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
213
- guidelinePaths.push(path.resolve(resolvedPath));
214
- if (verbose) {
215
- console.log(` [Guideline] Found: ${displayPath}`);
216
- console.log(` Resolved to: ${resolvedPath}`);
217
- }
218
- continue;
219
- }
220
- }
221
- segments.push({
222
- type: "file",
223
- path: displayPath,
224
- text: fileContent,
225
- resolvedPath: path.resolve(resolvedPath)
226
- });
227
- if (verbose) {
228
- const label = messageType === "input" ? "[File]" : "[Expected Output File]";
229
- console.log(` ${label} Found: ${displayPath}`);
230
- console.log(` Resolved to: ${resolvedPath}`);
231
- }
232
- } catch (error) {
233
- const context = messageType === "input" ? "" : " expected output";
234
- logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
235
- }
236
- continue;
237
- }
238
- const clonedSegment = cloneJsonObject(rawSegment);
239
- segments.push(clonedSegment);
240
- const inlineValue = clonedSegment.value;
241
- if (typeof inlineValue === "string" && textParts) {
242
- textParts.push(inlineValue);
243
- }
244
- }
245
- }
246
- return segments;
247
- }
248
- async function loadEvalCases(evalFilePath, repoRoot, options) {
249
- const verbose = options?.verbose ?? false;
250
- const evalIdFilter = options?.evalId;
251
- const absoluteTestPath = path.resolve(evalFilePath);
252
- if (!await fileExists2(absoluteTestPath)) {
253
- throw new Error(`Test file not found: ${evalFilePath}`);
254
- }
255
- const repoRootPath = resolveToAbsolutePath(repoRoot);
256
- const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
257
- const config = await loadConfig(absoluteTestPath, repoRootPath);
258
- const guidelinePatterns = config?.guideline_patterns;
259
- const rawFile = await readFile(absoluteTestPath, "utf8");
260
- const parsed = parse(rawFile);
261
- if (!isJsonObject(parsed)) {
262
- throw new Error(`Invalid test file format: ${evalFilePath}`);
263
- }
264
- const suite = parsed;
265
- const datasetNameFromSuite = asString(suite.dataset)?.trim();
266
- const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
267
- const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
268
- const schema = suite.$schema;
269
- if (schema !== SCHEMA_EVAL_V2) {
270
- const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
271
- Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
272
- throw new Error(message);
273
- }
274
- const rawTestcases = suite.evalcases;
275
- if (!Array.isArray(rawTestcases)) {
276
- throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
277
- }
278
- const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
279
- const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
280
- const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
281
- const results = [];
282
- for (const rawEvalcase of rawTestcases) {
283
- if (!isJsonObject(rawEvalcase)) {
284
- logWarning("Skipping invalid eval case entry (expected object)");
285
- continue;
286
- }
287
- const evalcase = rawEvalcase;
288
- const id = asString(evalcase.id);
289
- if (evalIdFilter && id !== evalIdFilter) {
290
- continue;
291
- }
292
- const conversationId = asString(evalcase.conversation_id);
293
- const outcome = asString(evalcase.outcome);
294
- const inputMessagesValue = evalcase.input_messages;
295
- const expectedMessagesValue = evalcase.expected_messages;
296
- if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
297
- logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
298
- continue;
299
- }
300
- const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
301
- const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
302
- const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
303
- if (hasExpectedMessages && expectedMessages.length === 0) {
304
- logWarning(`No valid expected message found for eval case: ${id}`);
305
- continue;
306
- }
307
- if (expectedMessages.length > 1) {
308
- logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
309
- }
310
- const guidelinePaths = [];
311
- const inputTextParts = [];
312
- const inputSegments = await processMessages({
313
- messages: inputMessages,
314
- searchRoots,
315
- repoRootPath,
316
- guidelinePatterns,
317
- guidelinePaths,
318
- textParts: inputTextParts,
319
- messageType: "input",
320
- verbose
321
- });
322
- const outputSegments = hasExpectedMessages ? await processMessages({
323
- messages: expectedMessages,
324
- searchRoots,
325
- repoRootPath,
326
- guidelinePatterns,
327
- messageType: "output",
328
- verbose
329
- }) : [];
330
- const codeSnippets = extractCodeBlocks(inputSegments);
331
- const expectedContent = expectedMessages[0]?.content;
332
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
333
- const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
334
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
335
- const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
336
- const userFilePaths = [];
337
- for (const segment of inputSegments) {
338
- if (segment.type === "file" && typeof segment.resolvedPath === "string") {
339
- userFilePaths.push(segment.resolvedPath);
340
- }
341
- }
342
- const allFilePaths = [
343
- ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
344
- ...userFilePaths
345
- ];
346
- const testCase = {
347
- id,
348
- dataset: datasetName,
349
- conversation_id: conversationId,
350
- question,
351
- input_messages: inputMessages,
352
- input_segments: inputSegments,
353
- output_segments: outputSegments,
354
- reference_answer: referenceAnswer,
355
- guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
356
- guideline_patterns: guidelinePatterns,
357
- file_paths: allFilePaths,
358
- code_snippets: codeSnippets,
359
- expected_outcome: outcome,
360
- evaluator: evalCaseEvaluatorKind,
361
- evaluators
362
- };
363
- if (verbose) {
364
- console.log(`
365
- [Eval Case: ${id}]`);
366
- if (testCase.guideline_paths.length > 0) {
367
- console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
368
- for (const guidelinePath of testCase.guideline_paths) {
369
- console.log(` - ${guidelinePath}`);
459
+ searchRoots
460
+ );
461
+ if (!resolvedPath) {
462
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
463
+ const context = messageType === "input" ? "" : " in expected_messages";
464
+ logWarning3(`File not found${context}: ${displayPath}`, attempts);
465
+ continue;
370
466
  }
371
- } else {
372
- console.log(" No guidelines found");
467
+ try {
468
+ const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
469
+ if (messageType === "input" && guidelinePatterns && guidelinePaths) {
470
+ const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
471
+ if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
472
+ guidelinePaths.push(path4.resolve(resolvedPath));
473
+ if (verbose) {
474
+ console.log(` [Guideline] Found: ${displayPath}`);
475
+ console.log(` Resolved to: ${resolvedPath}`);
476
+ }
477
+ continue;
478
+ }
479
+ }
480
+ segments.push({
481
+ type: "file",
482
+ path: displayPath,
483
+ text: fileContent,
484
+ resolvedPath: path4.resolve(resolvedPath)
485
+ });
486
+ if (verbose) {
487
+ const label = messageType === "input" ? "[File]" : "[Expected Output File]";
488
+ console.log(` ${label} Found: ${displayPath}`);
489
+ console.log(` Resolved to: ${resolvedPath}`);
490
+ }
491
+ } catch (error) {
492
+ const context = messageType === "input" ? "" : " expected output";
493
+ logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
494
+ }
495
+ continue;
496
+ }
497
+ const clonedSegment = cloneJsonObject(rawSegment);
498
+ segments.push(clonedSegment);
499
+ const inlineValue = clonedSegment.value;
500
+ if (typeof inlineValue === "string" && textParts) {
501
+ textParts.push(inlineValue);
373
502
  }
374
503
  }
375
- results.push(testCase);
376
504
  }
377
- return results;
505
+ return segments;
378
506
  }
379
- function needsRoleMarkers(messages, processedSegmentsByMessage) {
380
- if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
381
- return true;
507
+ async function resolveAssistantContent(content, searchRoots, verbose) {
508
+ if (typeof content === "string") {
509
+ return content;
382
510
  }
383
- let messagesWithContent = 0;
384
- for (const segments of processedSegmentsByMessage) {
385
- if (hasVisibleContent(segments)) {
386
- messagesWithContent++;
387
- }
511
+ if (!content) {
512
+ return "";
388
513
  }
389
- return messagesWithContent > 1;
390
- }
391
- function hasVisibleContent(segments) {
392
- return segments.some((segment) => {
393
- const type = asString(segment.type);
394
- if (type === "text") {
395
- const value = asString(segment.value);
396
- return value !== void 0 && value.trim().length > 0;
514
+ const parts = [];
515
+ for (const entry of content) {
516
+ if (typeof entry === "string") {
517
+ parts.push({ content: entry, isFile: false });
518
+ continue;
397
519
  }
398
- if (type === "guideline_ref") {
399
- return false;
520
+ if (!isJsonObject(entry)) {
521
+ continue;
400
522
  }
401
- if (type === "file") {
402
- const text = asString(segment.text);
403
- return text !== void 0 && text.trim().length > 0;
523
+ const segmentType = asString3(entry.type);
524
+ if (segmentType === "file") {
525
+ const rawValue = asString3(entry.value);
526
+ if (!rawValue) {
527
+ continue;
528
+ }
529
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
530
+ rawValue,
531
+ searchRoots
532
+ );
533
+ if (!resolvedPath) {
534
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
535
+ logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
536
+ continue;
537
+ }
538
+ try {
539
+ const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
540
+ parts.push({ content: fileContent, isFile: true, displayPath });
541
+ if (verbose) {
542
+ console.log(` [Expected Assistant File] Found: ${displayPath}`);
543
+ console.log(` Resolved to: ${resolvedPath}`);
544
+ }
545
+ } catch (error) {
546
+ logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
547
+ }
548
+ continue;
404
549
  }
405
- return false;
406
- });
550
+ const textValue = asString3(entry.text);
551
+ if (typeof textValue === "string") {
552
+ parts.push({ content: textValue, isFile: false });
553
+ continue;
554
+ }
555
+ const valueValue = asString3(entry.value);
556
+ if (typeof valueValue === "string") {
557
+ parts.push({ content: valueValue, isFile: false });
558
+ continue;
559
+ }
560
+ parts.push({ content: JSON.stringify(entry), isFile: false });
561
+ }
562
+ return formatFileContents(parts);
407
563
  }
408
- function formatSegment(segment) {
409
- const type = asString(segment.type);
410
- if (type === "text") {
411
- return asString(segment.value);
564
+ function asString3(value) {
565
+ return typeof value === "string" ? value : void 0;
566
+ }
567
+ function cloneJsonObject(source) {
568
+ const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
569
+ return Object.fromEntries(entries);
570
+ }
571
+ function cloneJsonValue(value) {
572
+ if (value === null) {
573
+ return null;
412
574
  }
413
- if (type === "guideline_ref") {
414
- const refPath = asString(segment.path);
415
- return refPath ? `<Attached: ${refPath}>` : void 0;
575
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
576
+ return value;
416
577
  }
417
- if (type === "file") {
418
- const text = asString(segment.text);
419
- const filePath = asString(segment.path);
420
- if (text && filePath) {
421
- return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
422
- }
578
+ if (Array.isArray(value)) {
579
+ return value.map((item) => cloneJsonValue(item));
580
+ }
581
+ if (typeof value === "object") {
582
+ return cloneJsonObject(value);
583
+ }
584
+ return value;
585
+ }
586
+ function logWarning3(message, details) {
587
+ if (details && details.length > 0) {
588
+ const detailBlock = details.join("\n");
589
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}
590
+ ${detailBlock}${ANSI_RESET3}`);
591
+ } else {
592
+ console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
423
593
  }
424
- return void 0;
425
594
  }
595
+
596
+ // src/evaluation/formatting/prompt-builder.ts
597
+ import { readFile as readFile3 } from "node:fs/promises";
598
+ import path5 from "node:path";
599
+ var ANSI_YELLOW4 = "\x1B[33m";
600
+ var ANSI_RESET4 = "\x1B[0m";
426
601
  async function buildPromptInputs(testCase) {
427
602
  const guidelineParts = [];
428
603
  for (const rawPath of testCase.guideline_paths) {
429
- const absolutePath = path.resolve(rawPath);
604
+ const absolutePath = path5.resolve(rawPath);
430
605
  if (!await fileExists2(absolutePath)) {
431
- logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
606
+ logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
432
607
  continue;
433
608
  }
434
609
  try {
435
- const content = (await readFile(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
610
+ const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
436
611
  guidelineParts.push({
437
612
  content,
438
613
  isFile: true,
439
- displayPath: path.basename(absolutePath)
614
+ displayPath: path5.basename(absolutePath)
440
615
  });
441
616
  } catch (error) {
442
- logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
617
+ logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
443
618
  }
444
619
  }
445
620
  const guidelines = formatFileContents(guidelineParts);
@@ -463,9 +638,9 @@ async function buildPromptInputs(testCase) {
463
638
  messageSegments.push({ type: "text", value: segment });
464
639
  }
465
640
  } else if (isJsonObject(segment)) {
466
- const type = asString(segment.type);
641
+ const type = asString4(segment.type);
467
642
  if (type === "file") {
468
- const value = asString(segment.value);
643
+ const value = asString4(segment.value);
469
644
  if (!value) continue;
470
645
  if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
471
646
  messageSegments.push({ type: "guideline_ref", path: value });
@@ -476,7 +651,7 @@ async function buildPromptInputs(testCase) {
476
651
  messageSegments.push({ type: "file", text: fileText, path: value });
477
652
  }
478
653
  } else if (type === "text") {
479
- const textValue = asString(segment.value);
654
+ const textValue = asString4(segment.value);
480
655
  if (textValue && textValue.trim().length > 0) {
481
656
  messageSegments.push({ type: "text", value: textValue });
482
657
  }
@@ -532,6 +707,18 @@ ${messageContent}`);
532
707
  }) : void 0;
533
708
  return { question, guidelines, chatPrompt };
534
709
  }
710
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
711
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
712
+ return true;
713
+ }
714
+ let messagesWithContent = 0;
715
+ for (const segments of processedSegmentsByMessage) {
716
+ if (hasVisibleContent(segments)) {
717
+ messagesWithContent++;
718
+ }
719
+ }
720
+ return messagesWithContent > 1;
721
+ }
535
722
  function buildChatPromptFromSegments(options) {
536
723
  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
537
724
  if (messages.length === 0) {
@@ -570,243 +757,203 @@ ${guidelineContent.trim()}`);
570
757
  }
571
758
  for (let i = startIndex; i < messages.length; i++) {
572
759
  const message = messages[i];
573
- const segments = segmentsByMessage[i];
574
- const contentParts = [];
575
- let role = message.role;
576
- let name;
577
- if (role === "system") {
578
- role = "assistant";
579
- contentParts.push("@[System]:");
580
- } else if (role === "tool") {
581
- role = "function";
582
- name = "tool";
583
- }
584
- for (const segment of segments) {
585
- if (segment.type === "guideline_ref") {
586
- continue;
587
- }
588
- const formatted = formatSegment(segment);
589
- if (formatted) {
590
- const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
591
- if (isGuidelineRef) {
592
- continue;
593
- }
594
- contentParts.push(formatted);
595
- }
596
- }
597
- if (contentParts.length === 0) {
598
- continue;
599
- }
600
- chatPrompt.push({
601
- role,
602
- content: contentParts.join("\n"),
603
- ...name ? { name } : {}
604
- });
605
- }
606
- return chatPrompt.length > 0 ? chatPrompt : void 0;
607
- }
608
- async function fileExists2(absolutePath) {
609
- try {
610
- await access(absolutePath, constants.F_OK);
611
- return true;
612
- } catch {
613
- return false;
614
- }
615
- }
616
- function resolveToAbsolutePath(candidate) {
617
- if (candidate instanceof URL) {
618
- return fileURLToPath(candidate);
619
- }
620
- if (typeof candidate === "string") {
621
- if (candidate.startsWith("file://")) {
622
- return fileURLToPath(new URL(candidate));
623
- }
624
- return path.resolve(candidate);
625
- }
626
- throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
627
- }
628
- function asString(value) {
629
- return typeof value === "string" ? value : void 0;
630
- }
631
- function cloneJsonObject(source) {
632
- const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
633
- return Object.fromEntries(entries);
634
- }
635
- function cloneJsonValue(value) {
636
- if (value === null) {
637
- return null;
638
- }
639
- if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
640
- return value;
641
- }
642
- if (Array.isArray(value)) {
643
- return value.map((item) => cloneJsonValue(item));
644
- }
645
- return cloneJsonObject(value);
646
- }
647
- function formatFileContents(parts) {
648
- const fileCount = parts.filter((p) => p.isFile).length;
649
- if (fileCount > 0) {
650
- return parts.map((part) => {
651
- if (part.isFile && part.displayPath) {
652
- return `<file path="${part.displayPath}">
653
- ${part.content}
654
- </file>`;
655
- }
656
- return part.content;
657
- }).join("\n\n");
658
- }
659
- return parts.map((p) => p.content).join(" ");
660
- }
661
- async function resolveAssistantContent(content, searchRoots, verbose) {
662
- if (typeof content === "string") {
663
- return content;
664
- }
665
- if (!content) {
666
- return "";
667
- }
668
- const parts = [];
669
- for (const entry of content) {
670
- if (typeof entry === "string") {
671
- parts.push({ content: entry, isFile: false });
672
- continue;
673
- }
674
- if (!isJsonObject(entry)) {
675
- continue;
760
+ const segments = segmentsByMessage[i];
761
+ const contentParts = [];
762
+ let role = message.role;
763
+ let name;
764
+ if (role === "system") {
765
+ role = "assistant";
766
+ contentParts.push("@[System]:");
767
+ } else if (role === "tool") {
768
+ role = "function";
769
+ name = "tool";
676
770
  }
677
- const segmentType = asString(entry.type);
678
- if (segmentType === "file") {
679
- const rawValue = asString(entry.value);
680
- if (!rawValue) {
681
- continue;
682
- }
683
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
684
- rawValue,
685
- searchRoots
686
- );
687
- if (!resolvedPath) {
688
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
689
- logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
771
+ for (const segment of segments) {
772
+ if (segment.type === "guideline_ref") {
690
773
  continue;
691
774
  }
692
- try {
693
- const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
694
- parts.push({ content: fileContent, isFile: true, displayPath });
695
- if (verbose) {
696
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
697
- console.log(` Resolved to: ${resolvedPath}`);
775
+ const formatted = formatSegment(segment);
776
+ if (formatted) {
777
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
778
+ if (isGuidelineRef) {
779
+ continue;
698
780
  }
699
- } catch (error) {
700
- logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
781
+ contentParts.push(formatted);
701
782
  }
702
- continue;
703
783
  }
704
- const textValue = asString(entry.text);
705
- if (typeof textValue === "string") {
706
- parts.push({ content: textValue, isFile: false });
784
+ if (contentParts.length === 0) {
707
785
  continue;
708
786
  }
709
- const valueValue = asString(entry.value);
710
- if (typeof valueValue === "string") {
711
- parts.push({ content: valueValue, isFile: false });
712
- continue;
787
+ chatPrompt.push({
788
+ role,
789
+ content: contentParts.join("\n"),
790
+ ...name ? { name } : {}
791
+ });
792
+ }
793
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
794
+ }
795
+ function asString4(value) {
796
+ return typeof value === "string" ? value : void 0;
797
+ }
798
+ function logWarning4(message) {
799
+ console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
800
+ }
801
+
802
+ // src/evaluation/yaml-parser.ts
803
+ var ANSI_YELLOW5 = "\x1B[33m";
804
+ var ANSI_RESET5 = "\x1B[0m";
805
+ var SCHEMA_EVAL_V2 = "agentv-eval-v2";
806
+ async function readTestSuiteMetadata(testFilePath) {
807
+ try {
808
+ const absolutePath = path6.resolve(testFilePath);
809
+ const content = await readFile4(absolutePath, "utf8");
810
+ const parsed = parse2(content);
811
+ if (!isJsonObject(parsed)) {
812
+ return {};
713
813
  }
714
- parts.push({ content: JSON.stringify(entry), isFile: false });
814
+ return { target: extractTargetFromSuite(parsed) };
815
+ } catch {
816
+ return {};
715
817
  }
716
- return formatFileContents(parts);
717
818
  }
718
- async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
719
- const execution = rawEvalCase.execution;
720
- const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
721
- if (candidateEvaluators === void 0) {
722
- return void 0;
819
+ async function loadEvalCases(evalFilePath, repoRoot, options) {
820
+ const verbose = options?.verbose ?? false;
821
+ const evalIdFilter = options?.evalId;
822
+ const absoluteTestPath = path6.resolve(evalFilePath);
823
+ const repoRootPath = resolveToAbsolutePath(repoRoot);
824
+ const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
825
+ const config = await loadConfig(absoluteTestPath, repoRootPath);
826
+ const guidelinePatterns = config?.guideline_patterns;
827
+ const rawFile = await readFile4(absoluteTestPath, "utf8");
828
+ const parsed = parse2(rawFile);
829
+ if (!isJsonObject(parsed)) {
830
+ throw new Error(`Invalid test file format: ${evalFilePath}`);
723
831
  }
724
- if (!Array.isArray(candidateEvaluators)) {
725
- logWarning(`Skipping evaluators for '${evalId}': expected array`);
726
- return void 0;
832
+ const suite = parsed;
833
+ const datasetNameFromSuite = asString5(suite.dataset)?.trim();
834
+ const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
835
+ const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
836
+ const schema = suite.$schema;
837
+ if (schema !== SCHEMA_EVAL_V2) {
838
+ const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
839
+ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
840
+ throw new Error(message);
727
841
  }
728
- const evaluators = [];
729
- for (const rawEvaluator of candidateEvaluators) {
730
- if (!isJsonObject(rawEvaluator)) {
731
- logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
842
+ const rawTestcases = suite.evalcases;
843
+ if (!Array.isArray(rawTestcases)) {
844
+ throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
845
+ }
846
+ const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
847
+ const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
848
+ const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
849
+ const results = [];
850
+ for (const rawEvalcase of rawTestcases) {
851
+ if (!isJsonObject(rawEvalcase)) {
852
+ logWarning5("Skipping invalid eval case entry (expected object)");
732
853
  continue;
733
854
  }
734
- const name = asString(rawEvaluator.name);
735
- const typeValue = rawEvaluator.type;
736
- if (!name || !isEvaluatorKind(typeValue)) {
737
- logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
855
+ const evalcase = rawEvalcase;
856
+ const id = asString5(evalcase.id);
857
+ if (evalIdFilter && id !== evalIdFilter) {
738
858
  continue;
739
859
  }
740
- if (typeValue === "code") {
741
- const script = asString(rawEvaluator.script);
742
- if (!script) {
743
- logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
744
- continue;
745
- }
746
- const cwd = asString(rawEvaluator.cwd);
747
- let resolvedCwd;
748
- if (cwd) {
749
- const resolved = await resolveFileReference(cwd, searchRoots);
750
- if (resolved.resolvedPath) {
751
- resolvedCwd = path.resolve(resolved.resolvedPath);
752
- } else {
753
- logWarning(
754
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
755
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
756
- );
757
- }
758
- } else {
759
- resolvedCwd = searchRoots[0];
760
- }
761
- evaluators.push({
762
- name,
763
- type: "code",
764
- script,
765
- cwd,
766
- resolvedCwd
767
- });
860
+ const conversationId = asString5(evalcase.conversation_id);
861
+ const outcome = asString5(evalcase.outcome);
862
+ const inputMessagesValue = evalcase.input_messages;
863
+ const expectedMessagesValue = evalcase.expected_messages;
864
+ if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
865
+ logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
768
866
  continue;
769
867
  }
770
- const prompt = asString(rawEvaluator.prompt);
771
- let promptPath;
772
- if (prompt) {
773
- const resolved = await resolveFileReference(prompt, searchRoots);
774
- if (resolved.resolvedPath) {
775
- promptPath = path.resolve(resolved.resolvedPath);
868
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
869
+ const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
870
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
871
+ if (hasExpectedMessages && expectedMessages.length === 0) {
872
+ logWarning5(`No valid expected message found for eval case: ${id}`);
873
+ continue;
874
+ }
875
+ if (expectedMessages.length > 1) {
876
+ logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
877
+ }
878
+ const guidelinePaths = [];
879
+ const inputTextParts = [];
880
+ const inputSegments = await processMessages({
881
+ messages: inputMessages,
882
+ searchRoots,
883
+ repoRootPath,
884
+ guidelinePatterns,
885
+ guidelinePaths,
886
+ textParts: inputTextParts,
887
+ messageType: "input",
888
+ verbose
889
+ });
890
+ const outputSegments = hasExpectedMessages ? await processMessages({
891
+ messages: expectedMessages,
892
+ searchRoots,
893
+ repoRootPath,
894
+ guidelinePatterns,
895
+ messageType: "output",
896
+ verbose
897
+ }) : [];
898
+ const codeSnippets = extractCodeBlocks(inputSegments);
899
+ const expectedContent = expectedMessages[0]?.content;
900
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
901
+ const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
902
+ const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
903
+ const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
904
+ const userFilePaths = [];
905
+ for (const segment of inputSegments) {
906
+ if (segment.type === "file" && typeof segment.resolvedPath === "string") {
907
+ userFilePaths.push(segment.resolvedPath);
908
+ }
909
+ }
910
+ const allFilePaths = [
911
+ ...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
912
+ ...userFilePaths
913
+ ];
914
+ const testCase = {
915
+ id,
916
+ dataset: datasetName,
917
+ conversation_id: conversationId,
918
+ question,
919
+ input_messages: inputMessages,
920
+ input_segments: inputSegments,
921
+ output_segments: outputSegments,
922
+ reference_answer: referenceAnswer,
923
+ guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
924
+ guideline_patterns: guidelinePatterns,
925
+ file_paths: allFilePaths,
926
+ code_snippets: codeSnippets,
927
+ expected_outcome: outcome,
928
+ evaluator: evalCaseEvaluatorKind,
929
+ evaluators
930
+ };
931
+ if (verbose) {
932
+ console.log(`
933
+ [Eval Case: ${id}]`);
934
+ if (testCase.guideline_paths.length > 0) {
935
+ console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
936
+ for (const guidelinePath of testCase.guideline_paths) {
937
+ console.log(` - ${guidelinePath}`);
938
+ }
776
939
  } else {
777
- logWarning(
778
- `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
779
- resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
780
- );
940
+ console.log(" No guidelines found");
781
941
  }
782
942
  }
783
- const model = asString(rawEvaluator.model);
784
- evaluators.push({
785
- name,
786
- type: "llm_judge",
787
- prompt,
788
- promptPath
789
- });
943
+ results.push(testCase);
790
944
  }
791
- return evaluators.length > 0 ? evaluators : void 0;
945
+ return results;
792
946
  }
793
- function coerceEvaluator(candidate, contextId) {
794
- if (typeof candidate !== "string") {
795
- return void 0;
796
- }
797
- if (isEvaluatorKind(candidate)) {
798
- return candidate;
799
- }
800
- logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
801
- return void 0;
947
+ function asString5(value) {
948
+ return typeof value === "string" ? value : void 0;
802
949
  }
803
- function logWarning(message, details) {
950
+ function logWarning5(message, details) {
804
951
  if (details && details.length > 0) {
805
952
  const detailBlock = details.join("\n");
806
- console.warn(`${ANSI_YELLOW}Warning: ${message}
807
- ${detailBlock}${ANSI_RESET}`);
953
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}
954
+ ${detailBlock}${ANSI_RESET5}`);
808
955
  } else {
809
- console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
956
+ console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
810
957
  }
811
958
  }
812
959
 
@@ -838,9 +985,8 @@ function buildChatPrompt(request) {
838
985
  }
839
986
  function resolveSystemContent(request) {
840
987
  const systemSegments = [];
841
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
842
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
843
- systemSegments.push(metadataSystemPrompt.trim());
988
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
989
+ systemSegments.push(request.systemPrompt.trim());
844
990
  } else {
845
991
  systemSegments.push(DEFAULT_SYSTEM_PROMPT);
846
992
  }
@@ -1093,7 +1239,7 @@ var GeminiProvider = class {
1093
1239
  import { exec as execWithCallback } from "node:child_process";
1094
1240
  import fs from "node:fs/promises";
1095
1241
  import os from "node:os";
1096
- import path2 from "node:path";
1242
+ import path7 from "node:path";
1097
1243
  import { promisify } from "node:util";
1098
1244
  var execAsync = promisify(execWithCallback);
1099
1245
  var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
@@ -1272,7 +1418,7 @@ function normalizeInputFiles(inputFiles) {
1272
1418
  }
1273
1419
  const unique = /* @__PURE__ */ new Map();
1274
1420
  for (const inputFile of inputFiles) {
1275
- const absolutePath = path2.resolve(inputFile);
1421
+ const absolutePath = path7.resolve(inputFile);
1276
1422
  if (!unique.has(absolutePath)) {
1277
1423
  unique.set(absolutePath, absolutePath);
1278
1424
  }
@@ -1286,7 +1432,7 @@ function formatFileList(files, template) {
1286
1432
  const formatter = template ?? "{path}";
1287
1433
  return files.map((filePath) => {
1288
1434
  const escapedPath = shellEscape(filePath);
1289
- const escapedName = shellEscape(path2.basename(filePath));
1435
+ const escapedName = shellEscape(path7.basename(filePath));
1290
1436
  return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
1291
1437
  }).join(" ");
1292
1438
  }
@@ -1310,7 +1456,7 @@ function generateOutputFilePath(evalCaseId) {
1310
1456
  const safeEvalId = evalCaseId || "unknown";
1311
1457
  const timestamp = Date.now();
1312
1458
  const random = Math.random().toString(36).substring(2, 9);
1313
- return path2.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1459
+ return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
1314
1460
  }
1315
1461
  function formatTimeoutSuffix(timeoutMs) {
1316
1462
  if (!timeoutMs || timeoutMs <= 0) {
@@ -1326,7 +1472,7 @@ import { randomUUID } from "node:crypto";
1326
1472
  import { constants as constants2, createWriteStream } from "node:fs";
1327
1473
  import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
1328
1474
  import { tmpdir } from "node:os";
1329
- import path4 from "node:path";
1475
+ import path9 from "node:path";
1330
1476
  import { promisify as promisify2 } from "node:util";
1331
1477
 
1332
1478
  // src/evaluation/providers/codex-log-tracker.ts
@@ -1383,7 +1529,7 @@ function subscribeToCodexLogEntries(listener) {
1383
1529
  }
1384
1530
 
1385
1531
  // src/evaluation/providers/preread.ts
1386
- import path3 from "node:path";
1532
+ import path8 from "node:path";
1387
1533
  function buildPromptDocument(request, inputFiles, options) {
1388
1534
  const parts = [];
1389
1535
  const guidelineFiles = collectGuidelineFiles(
@@ -1408,7 +1554,7 @@ function normalizeInputFiles2(inputFiles) {
1408
1554
  }
1409
1555
  const deduped = /* @__PURE__ */ new Map();
1410
1556
  for (const inputFile of inputFiles) {
1411
- const absolutePath = path3.resolve(inputFile);
1557
+ const absolutePath = path8.resolve(inputFile);
1412
1558
  if (!deduped.has(absolutePath)) {
1413
1559
  deduped.set(absolutePath, absolutePath);
1414
1560
  }
@@ -1421,14 +1567,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
1421
1567
  }
1422
1568
  const unique = /* @__PURE__ */ new Map();
1423
1569
  for (const inputFile of inputFiles) {
1424
- const absolutePath = path3.resolve(inputFile);
1570
+ const absolutePath = path8.resolve(inputFile);
1425
1571
  if (overrides?.has(absolutePath)) {
1426
1572
  if (!unique.has(absolutePath)) {
1427
1573
  unique.set(absolutePath, absolutePath);
1428
1574
  }
1429
1575
  continue;
1430
1576
  }
1431
- const normalized = absolutePath.split(path3.sep).join("/");
1577
+ const normalized = absolutePath.split(path8.sep).join("/");
1432
1578
  if (isGuidelineFile(normalized, guidelinePatterns)) {
1433
1579
  if (!unique.has(absolutePath)) {
1434
1580
  unique.set(absolutePath, absolutePath);
@@ -1443,7 +1589,7 @@ function collectInputFiles(inputFiles) {
1443
1589
  }
1444
1590
  const unique = /* @__PURE__ */ new Map();
1445
1591
  for (const inputFile of inputFiles) {
1446
- const absolutePath = path3.resolve(inputFile);
1592
+ const absolutePath = path8.resolve(inputFile);
1447
1593
  if (!unique.has(absolutePath)) {
1448
1594
  unique.set(absolutePath, absolutePath);
1449
1595
  }
@@ -1455,7 +1601,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
1455
1601
  return "";
1456
1602
  }
1457
1603
  const buildList = (files) => files.map((absolutePath) => {
1458
- const fileName = path3.basename(absolutePath);
1604
+ const fileName = path8.basename(absolutePath);
1459
1605
  const fileUri = pathToFileUri(absolutePath);
1460
1606
  return `* [${fileName}](${fileUri})`;
1461
1607
  });
@@ -1475,7 +1621,7 @@ ${buildList(inputFiles).join("\n")}.`);
1475
1621
  return sections.join("\n");
1476
1622
  }
1477
1623
  function pathToFileUri(filePath) {
1478
- const absolutePath = path3.isAbsolute(filePath) ? filePath : path3.resolve(filePath);
1624
+ const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
1479
1625
  const normalizedPath = absolutePath.replace(/\\/g, "/");
1480
1626
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
1481
1627
  return `file:///${normalizedPath}`;
@@ -1513,7 +1659,7 @@ var CodexProvider = class {
1513
1659
  const logger = await this.createStreamLogger(request).catch(() => void 0);
1514
1660
  try {
1515
1661
  const promptContent = buildPromptDocument(request, inputFiles);
1516
- const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
1662
+ const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
1517
1663
  await writeFile(promptFile, promptContent, "utf8");
1518
1664
  const args = this.buildCodexArgs();
1519
1665
  const cwd = this.resolveCwd(workspaceRoot);
@@ -1563,7 +1709,7 @@ var CodexProvider = class {
1563
1709
  if (!this.config.cwd) {
1564
1710
  return workspaceRoot;
1565
1711
  }
1566
- return path4.resolve(this.config.cwd);
1712
+ return path9.resolve(this.config.cwd);
1567
1713
  }
1568
1714
  buildCodexArgs() {
1569
1715
  const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
@@ -1597,7 +1743,7 @@ var CodexProvider = class {
1597
1743
  }
1598
1744
  }
1599
1745
  async createWorkspace() {
1600
- return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
1746
+ return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
1601
1747
  }
1602
1748
  async cleanupWorkspace(workspaceRoot) {
1603
1749
  try {
@@ -1611,9 +1757,9 @@ var CodexProvider = class {
1611
1757
  return void 0;
1612
1758
  }
1613
1759
  if (this.config.logDir) {
1614
- return path4.resolve(this.config.logDir);
1760
+ return path9.resolve(this.config.logDir);
1615
1761
  }
1616
- return path4.join(process.cwd(), ".agentv", "logs", "codex");
1762
+ return path9.join(process.cwd(), ".agentv", "logs", "codex");
1617
1763
  }
1618
1764
  async createStreamLogger(request) {
1619
1765
  const logDir = this.resolveLogDirectory();
@@ -1627,7 +1773,7 @@ var CodexProvider = class {
1627
1773
  console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
1628
1774
  return void 0;
1629
1775
  }
1630
- const filePath = path4.join(logDir, buildLogFilename(request, this.targetName));
1776
+ const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
1631
1777
  try {
1632
1778
  const logger = await CodexStreamLogger.create({
1633
1779
  filePath,
@@ -1842,7 +1988,7 @@ function tryParseJsonValue(rawLine) {
1842
1988
  async function locateExecutable(candidate) {
1843
1989
  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
1844
1990
  if (includesPathSeparator) {
1845
- const resolved = path4.isAbsolute(candidate) ? candidate : path4.resolve(candidate);
1991
+ const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
1846
1992
  const executablePath = await ensureWindowsExecutableVariant(resolved);
1847
1993
  await access2(executablePath, constants2.F_OK);
1848
1994
  return executablePath;
@@ -2189,7 +2335,7 @@ var MockProvider = class {
2189
2335
  };
2190
2336
 
2191
2337
  // src/evaluation/providers/vscode.ts
2192
- import path5 from "node:path";
2338
+ import path10 from "node:path";
2193
2339
  import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
2194
2340
  var VSCodeProvider = class {
2195
2341
  id;
@@ -2302,6 +2448,9 @@ var VSCodeProvider = class {
2302
2448
  };
2303
2449
  function buildPromptDocument2(request, attachments, guidelinePatterns) {
2304
2450
  const parts = [];
2451
+ if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
2452
+ parts.push(request.systemPrompt.trim());
2453
+ }
2305
2454
  const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
2306
2455
  const attachmentFiles = collectAttachmentFiles(attachments);
2307
2456
  const nonGuidelineAttachments = attachmentFiles.filter(
@@ -2319,7 +2468,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
2319
2468
  return "";
2320
2469
  }
2321
2470
  const buildList = (files) => files.map((absolutePath) => {
2322
- const fileName = path5.basename(absolutePath);
2471
+ const fileName = path10.basename(absolutePath);
2323
2472
  const fileUri = pathToFileUri2(absolutePath);
2324
2473
  return `* [${fileName}](${fileUri})`;
2325
2474
  });
@@ -2344,8 +2493,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
2344
2493
  }
2345
2494
  const unique = /* @__PURE__ */ new Map();
2346
2495
  for (const attachment of attachments) {
2347
- const absolutePath = path5.resolve(attachment);
2348
- const normalized = absolutePath.split(path5.sep).join("/");
2496
+ const absolutePath = path10.resolve(attachment);
2497
+ const normalized = absolutePath.split(path10.sep).join("/");
2349
2498
  if (isGuidelineFile(normalized, guidelinePatterns)) {
2350
2499
  if (!unique.has(absolutePath)) {
2351
2500
  unique.set(absolutePath, absolutePath);
@@ -2360,7 +2509,7 @@ function collectAttachmentFiles(attachments) {
2360
2509
  }
2361
2510
  const unique = /* @__PURE__ */ new Map();
2362
2511
  for (const attachment of attachments) {
2363
- const absolutePath = path5.resolve(attachment);
2512
+ const absolutePath = path10.resolve(attachment);
2364
2513
  if (!unique.has(absolutePath)) {
2365
2514
  unique.set(absolutePath, absolutePath);
2366
2515
  }
@@ -2368,7 +2517,7 @@ function collectAttachmentFiles(attachments) {
2368
2517
  return Array.from(unique.values());
2369
2518
  }
2370
2519
  function pathToFileUri2(filePath) {
2371
- const absolutePath = path5.isAbsolute(filePath) ? filePath : path5.resolve(filePath);
2520
+ const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
2372
2521
  const normalizedPath = absolutePath.replace(/\\/g, "/");
2373
2522
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
2374
2523
  return `file:///${normalizedPath}`;
@@ -2381,7 +2530,7 @@ function normalizeAttachments(attachments) {
2381
2530
  }
2382
2531
  const deduped = /* @__PURE__ */ new Set();
2383
2532
  for (const attachment of attachments) {
2384
- deduped.add(path5.resolve(attachment));
2533
+ deduped.add(path10.resolve(attachment));
2385
2534
  }
2386
2535
  return Array.from(deduped);
2387
2536
  }
@@ -2390,7 +2539,7 @@ function mergeAttachments(all) {
2390
2539
  for (const list of all) {
2391
2540
  if (!list) continue;
2392
2541
  for (const inputFile of list) {
2393
- deduped.add(path5.resolve(inputFile));
2542
+ deduped.add(path10.resolve(inputFile));
2394
2543
  }
2395
2544
  }
2396
2545
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -2436,9 +2585,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
2436
2585
 
2437
2586
  // src/evaluation/providers/targets-file.ts
2438
2587
  import { constants as constants3 } from "node:fs";
2439
- import { access as access3, readFile as readFile2 } from "node:fs/promises";
2440
- import path6 from "node:path";
2441
- import { parse as parse2 } from "yaml";
2588
+ import { access as access3, readFile as readFile5 } from "node:fs/promises";
2589
+ import path11 from "node:path";
2590
+ import { parse as parse3 } from "yaml";
2442
2591
  function isRecord(value) {
2443
2592
  return typeof value === "object" && value !== null && !Array.isArray(value);
2444
2593
  }
@@ -2493,12 +2642,12 @@ async function fileExists3(filePath) {
2493
2642
  }
2494
2643
  }
2495
2644
  async function readTargetDefinitions(filePath) {
2496
- const absolutePath = path6.resolve(filePath);
2645
+ const absolutePath = path11.resolve(filePath);
2497
2646
  if (!await fileExists3(absolutePath)) {
2498
2647
  throw new Error(`targets.yaml not found at ${absolutePath}`);
2499
2648
  }
2500
- const raw = await readFile2(absolutePath, "utf8");
2501
- const parsed = parse2(raw);
2649
+ const raw = await readFile5(absolutePath, "utf8");
2650
+ const parsed = parse3(raw);
2502
2651
  if (!isRecord(parsed)) {
2503
2652
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
2504
2653
  }
@@ -2541,18 +2690,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
2541
2690
  }
2542
2691
 
2543
2692
  // src/evaluation/evaluators.ts
2544
- import { randomUUID as randomUUID2 } from "node:crypto";
2693
+ var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
2694
+
2695
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
2696
+
2697
+ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
2698
+
2699
+ [[ ## expected_outcome ## ]]
2700
+ {{expected_outcome}}
2701
+
2702
+ [[ ## question ## ]]
2703
+ {{question}}
2704
+
2705
+ [[ ## reference_answer ## ]]
2706
+ {{reference_answer}}
2707
+
2708
+ [[ ## candidate_answer ## ]]
2709
+ {{candidate_answer}}`;
2545
2710
  var LlmJudgeEvaluator = class {
2546
2711
  kind = "llm_judge";
2547
2712
  resolveJudgeProvider;
2548
2713
  maxOutputTokens;
2549
2714
  temperature;
2550
- customPrompt;
2715
+ evaluatorTemplate;
2551
2716
  constructor(options) {
2552
2717
  this.resolveJudgeProvider = options.resolveJudgeProvider;
2553
2718
  this.maxOutputTokens = options.maxOutputTokens;
2554
2719
  this.temperature = options.temperature;
2555
- this.customPrompt = options.customPrompt;
2720
+ this.evaluatorTemplate = options.evaluatorTemplate;
2556
2721
  }
2557
2722
  async evaluate(context) {
2558
2723
  const judgeProvider = await this.resolveJudgeProvider(context);
@@ -2562,26 +2727,21 @@ var LlmJudgeEvaluator = class {
2562
2727
  return this.evaluateWithPrompt(context, judgeProvider);
2563
2728
  }
2564
2729
  async evaluateWithPrompt(context, judgeProvider) {
2565
- const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
2566
2730
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
2567
- let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
2568
- let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
2569
- if (systemPrompt && hasTemplateVariables(systemPrompt)) {
2570
- const variables = {
2571
- input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2572
- output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2573
- candidate_answer: context.candidate,
2574
- reference_answer: context.evalCase.reference_answer ?? "",
2575
- expected_outcome: context.evalCase.expected_outcome,
2576
- question: formattedQuestion
2577
- };
2578
- prompt = substituteVariables(systemPrompt, variables);
2579
- systemPrompt = buildSystemPrompt(hasReferenceAnswer);
2580
- }
2581
- const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
2731
+ const variables = {
2732
+ input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
2733
+ output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
2734
+ candidate_answer: context.candidate.trim(),
2735
+ reference_answer: (context.evalCase.reference_answer ?? "").trim(),
2736
+ expected_outcome: context.evalCase.expected_outcome.trim(),
2737
+ question: formattedQuestion.trim()
2738
+ };
2739
+ const systemPrompt = buildOutputSchema();
2740
+ const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
2741
+ const userPrompt = substituteVariables(evaluatorTemplate, variables);
2582
2742
  const response = await judgeProvider.invoke({
2583
- question: prompt,
2584
- metadata,
2743
+ question: userPrompt,
2744
+ systemPrompt,
2585
2745
  evalCaseId: context.evalCase.id,
2586
2746
  attempt: context.attempt,
2587
2747
  maxOutputTokens: this.maxOutputTokens,
@@ -2594,11 +2754,9 @@ var LlmJudgeEvaluator = class {
2594
2754
  const reasoning = parsed.reasoning ?? response.reasoning;
2595
2755
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
2596
2756
  const evaluatorRawRequest = {
2597
- id: randomUUID2(),
2598
- provider: judgeProvider.id,
2599
- prompt,
2600
- target: context.target.name,
2601
- ...systemPrompt !== void 0 && { systemPrompt }
2757
+ userPrompt,
2758
+ systemPrompt,
2759
+ target: judgeProvider.targetName
2602
2760
  };
2603
2761
  return {
2604
2762
  score,
@@ -2610,20 +2768,8 @@ var LlmJudgeEvaluator = class {
2610
2768
  };
2611
2769
  }
2612
2770
  };
2613
- function buildSystemPrompt(hasReferenceAnswer) {
2614
- const basePrompt = [
2615
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
2616
- ""
2617
- ];
2618
- if (hasReferenceAnswer) {
2619
- basePrompt.push(
2620
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
2621
- ""
2622
- );
2623
- }
2624
- basePrompt.push(
2625
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
2626
- "",
2771
+ function buildOutputSchema() {
2772
+ return [
2627
2773
  "You must respond with a single JSON object matching this schema:",
2628
2774
  "",
2629
2775
  "{",
@@ -2632,30 +2778,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
2632
2778
  ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
2633
2779
  ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
2634
2780
  "}"
2635
- );
2636
- return basePrompt.join("\n");
2637
- }
2638
- function buildQualityPrompt(evalCase, candidate, question) {
2639
- const parts = [
2640
- "[[ ## expected_outcome ## ]]",
2641
- evalCase.expected_outcome.trim(),
2642
- "",
2643
- "[[ ## question ## ]]",
2644
- question.trim(),
2645
- ""
2646
- ];
2647
- if (hasNonEmptyReferenceAnswer(evalCase)) {
2648
- parts.push(
2649
- "[[ ## reference_answer ## ]]",
2650
- evalCase.reference_answer.trim(),
2651
- ""
2652
- );
2653
- }
2654
- parts.push(
2655
- "[[ ## candidate_answer ## ]]",
2656
- candidate.trim()
2657
- );
2658
- return parts.join("\n");
2781
+ ].join("\n");
2659
2782
  }
2660
2783
  function clampScore(value) {
2661
2784
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -2737,9 +2860,6 @@ function extractJsonBlob(text) {
2737
2860
  function isNonEmptyString(value) {
2738
2861
  return typeof value === "string" && value.trim().length > 0;
2739
2862
  }
2740
- function hasNonEmptyReferenceAnswer(evalCase) {
2741
- return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
2742
- }
2743
2863
  var CodeEvaluator = class {
2744
2864
  kind = "code";
2745
2865
  script;
@@ -2845,19 +2965,16 @@ function parseJsonSafe(payload) {
2845
2965
  return void 0;
2846
2966
  }
2847
2967
  }
2848
- function hasTemplateVariables(text) {
2849
- return /\$\{[a-zA-Z0-9_]+\}/.test(text);
2850
- }
2851
2968
  function substituteVariables(template, variables) {
2852
- return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
2969
+ return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
2853
2970
  return variables[varName] ?? match;
2854
2971
  });
2855
2972
  }
2856
2973
 
2857
2974
  // src/evaluation/orchestrator.ts
2858
- import { createHash, randomUUID as randomUUID3 } from "node:crypto";
2975
+ import { createHash, randomUUID as randomUUID2 } from "node:crypto";
2859
2976
  import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
2860
- import path7 from "node:path";
2977
+ import path12 from "node:path";
2861
2978
 
2862
2979
  // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
2863
2980
  var Node = class {
@@ -3420,6 +3537,7 @@ async function evaluateCandidate(options) {
3420
3537
  }
3421
3538
  }
3422
3539
  return {
3540
+ timestamp: completedAt.toISOString(),
3423
3541
  eval_id: evalCase.id,
3424
3542
  dataset: evalCase.dataset,
3425
3543
  conversation_id: evalCase.conversation_id,
@@ -3427,14 +3545,12 @@ async function evaluateCandidate(options) {
3427
3545
  hits: score.hits,
3428
3546
  misses: score.misses,
3429
3547
  candidate_answer: candidate,
3430
- expected_aspect_count: score.expectedAspectCount,
3431
3548
  target: target.name,
3432
- timestamp: completedAt.toISOString(),
3433
3549
  reasoning: score.reasoning,
3434
3550
  raw_aspects: score.rawAspects,
3435
3551
  agent_provider_request: agentProviderRequest,
3436
3552
  lm_provider_request: lmProviderRequest,
3437
- evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3553
+ evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
3438
3554
  evaluator_results: evaluatorResults
3439
3555
  };
3440
3556
  }
@@ -3511,7 +3627,7 @@ async function runEvaluatorList(options) {
3511
3627
  hits: score2.hits,
3512
3628
  misses: score2.misses,
3513
3629
  reasoning: score2.reasoning,
3514
- evaluator_raw_request: score2.evaluatorRawRequest
3630
+ evaluator_provider_request: score2.evaluatorRawRequest
3515
3631
  });
3516
3632
  continue;
3517
3633
  }
@@ -3538,7 +3654,7 @@ async function runEvaluatorList(options) {
3538
3654
  hits: score2.hits,
3539
3655
  misses: score2.misses,
3540
3656
  reasoning: score2.reasoning,
3541
- evaluator_raw_request: score2.evaluatorRawRequest
3657
+ evaluator_provider_request: score2.evaluatorRawRequest
3542
3658
  });
3543
3659
  continue;
3544
3660
  }
@@ -3591,7 +3707,7 @@ async function runLlmJudgeEvaluator(options) {
3591
3707
  promptInputs,
3592
3708
  now,
3593
3709
  judgeProvider,
3594
- systemPrompt: customPrompt,
3710
+ evaluatorTemplateOverride: customPrompt,
3595
3711
  evaluator: config
3596
3712
  });
3597
3713
  }
@@ -3632,8 +3748,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
3632
3748
  async function dumpPrompt(directory, evalCase, promptInputs) {
3633
3749
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3634
3750
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
3635
- const filePath = path7.resolve(directory, filename);
3636
- await mkdir2(path7.dirname(filePath), { recursive: true });
3751
+ const filePath = path12.resolve(directory, filename);
3752
+ await mkdir2(path12.dirname(filePath), { recursive: true });
3637
3753
  const payload = {
3638
3754
  eval_id: evalCase.id,
3639
3755
  question: promptInputs.question,
@@ -3647,7 +3763,7 @@ function sanitizeFilename(value) {
3647
3763
  return "prompt";
3648
3764
  }
3649
3765
  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
3650
- return sanitized.length > 0 ? sanitized : randomUUID3();
3766
+ return sanitized.length > 0 ? sanitized : randomUUID2();
3651
3767
  }
3652
3768
  async function invokeProvider(provider, options) {
3653
3769
  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -3703,6 +3819,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3703
3819
  }
3704
3820
  }
3705
3821
  return {
3822
+ timestamp: timestamp.toISOString(),
3706
3823
  eval_id: evalCase.id,
3707
3824
  dataset: evalCase.dataset,
3708
3825
  conversation_id: evalCase.conversation_id,
@@ -3710,9 +3827,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
3710
3827
  hits: [],
3711
3828
  misses: [`Error: ${message}`],
3712
3829
  candidate_answer: `Error occurred: ${message}`,
3713
- expected_aspect_count: 0,
3714
3830
  target: targetName,
3715
- timestamp: timestamp.toISOString(),
3716
3831
  raw_aspects: [],
3717
3832
  agent_provider_request: agentProviderRequest,
3718
3833
  lm_provider_request: lmProviderRequest,