@agentv/core 0.10.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-YQBJAT5I.js → chunk-U3GEJ3K7.js} +1 -1
- package/dist/{chunk-YQBJAT5I.js.map → chunk-U3GEJ3K7.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +691 -562
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +29 -26
- package/dist/index.d.ts +29 -26
- package/dist/index.js +638 -507
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-U3GEJ3K7.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -62,48 +62,197 @@ function getHitCount(result) {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
// src/evaluation/yaml-parser.ts
|
|
65
|
+
import { readFile as readFile4 } from "node:fs/promises";
|
|
66
|
+
import path6 from "node:path";
|
|
67
|
+
import { parse as parse2 } from "yaml";
|
|
68
|
+
|
|
69
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
70
|
+
function extractCodeBlocks(segments) {
|
|
71
|
+
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
|
+
const codeBlocks = [];
|
|
73
|
+
for (const segment of segments) {
|
|
74
|
+
const typeValue = segment["type"];
|
|
75
|
+
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
76
|
+
continue;
|
|
77
|
+
}
|
|
78
|
+
const textValue = segment["value"];
|
|
79
|
+
if (typeof textValue !== "string") {
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
83
|
+
if (matches) {
|
|
84
|
+
codeBlocks.push(...matches);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
return codeBlocks;
|
|
88
|
+
}
|
|
89
|
+
function formatFileContents(parts) {
|
|
90
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
91
|
+
if (fileCount > 0) {
|
|
92
|
+
return parts.map((part) => {
|
|
93
|
+
if (part.isFile && part.displayPath) {
|
|
94
|
+
return `<file path="${part.displayPath}">
|
|
95
|
+
${part.content}
|
|
96
|
+
</file>`;
|
|
97
|
+
}
|
|
98
|
+
return part.content;
|
|
99
|
+
}).join("\n\n");
|
|
100
|
+
}
|
|
101
|
+
return parts.map((p) => p.content).join(" ");
|
|
102
|
+
}
|
|
103
|
+
function formatSegment(segment) {
|
|
104
|
+
const type = asString(segment.type);
|
|
105
|
+
if (type === "text") {
|
|
106
|
+
return asString(segment.value);
|
|
107
|
+
}
|
|
108
|
+
if (type === "guideline_ref") {
|
|
109
|
+
const refPath = asString(segment.path);
|
|
110
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
111
|
+
}
|
|
112
|
+
if (type === "file") {
|
|
113
|
+
const text = asString(segment.text);
|
|
114
|
+
const filePath = asString(segment.path);
|
|
115
|
+
if (text && filePath) {
|
|
116
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
return void 0;
|
|
120
|
+
}
|
|
121
|
+
function hasVisibleContent(segments) {
|
|
122
|
+
return segments.some((segment) => {
|
|
123
|
+
const type = asString(segment.type);
|
|
124
|
+
if (type === "text") {
|
|
125
|
+
const value = asString(segment.value);
|
|
126
|
+
return value !== void 0 && value.trim().length > 0;
|
|
127
|
+
}
|
|
128
|
+
if (type === "guideline_ref") {
|
|
129
|
+
return false;
|
|
130
|
+
}
|
|
131
|
+
if (type === "file") {
|
|
132
|
+
const text = asString(segment.text);
|
|
133
|
+
return text !== void 0 && text.trim().length > 0;
|
|
134
|
+
}
|
|
135
|
+
return false;
|
|
136
|
+
});
|
|
137
|
+
}
|
|
138
|
+
function asString(value) {
|
|
139
|
+
return typeof value === "string" ? value : void 0;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// src/evaluation/loaders/config-loader.ts
|
|
65
143
|
import micromatch from "micromatch";
|
|
144
|
+
import { readFile } from "node:fs/promises";
|
|
145
|
+
import path2 from "node:path";
|
|
146
|
+
import { parse } from "yaml";
|
|
147
|
+
|
|
148
|
+
// src/evaluation/loaders/file-resolver.ts
|
|
66
149
|
import { constants } from "node:fs";
|
|
67
|
-
import { access
|
|
150
|
+
import { access } from "node:fs/promises";
|
|
68
151
|
import path from "node:path";
|
|
69
|
-
|
|
70
|
-
import { parse } from "yaml";
|
|
71
|
-
var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
|
-
var ANSI_YELLOW = "\x1B[33m";
|
|
73
|
-
var ANSI_RESET = "\x1B[0m";
|
|
74
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
75
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
76
|
-
async function readTestSuiteMetadata(testFilePath) {
|
|
152
|
+
async function fileExists2(absolutePath) {
|
|
77
153
|
try {
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
const parsed = parse(content);
|
|
81
|
-
if (!isJsonObject(parsed)) {
|
|
82
|
-
return {};
|
|
83
|
-
}
|
|
84
|
-
return { target: extractTargetFromSuite(parsed) };
|
|
154
|
+
await access(absolutePath, constants.F_OK);
|
|
155
|
+
return true;
|
|
85
156
|
} catch {
|
|
86
|
-
return
|
|
157
|
+
return false;
|
|
87
158
|
}
|
|
88
159
|
}
|
|
89
|
-
function
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
160
|
+
function resolveToAbsolutePath(candidate) {
|
|
161
|
+
if (candidate instanceof URL) {
|
|
162
|
+
return new URL(candidate).pathname;
|
|
163
|
+
}
|
|
164
|
+
if (typeof candidate === "string") {
|
|
165
|
+
if (candidate.startsWith("file://")) {
|
|
166
|
+
return new URL(candidate).pathname;
|
|
95
167
|
}
|
|
168
|
+
return path.resolve(candidate);
|
|
96
169
|
}
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
170
|
+
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
171
|
+
}
|
|
172
|
+
function buildDirectoryChain2(filePath, repoRoot) {
|
|
173
|
+
const directories = [];
|
|
174
|
+
const seen = /* @__PURE__ */ new Set();
|
|
175
|
+
const boundary = path.resolve(repoRoot);
|
|
176
|
+
let current = path.resolve(path.dirname(filePath));
|
|
177
|
+
while (current !== void 0) {
|
|
178
|
+
if (!seen.has(current)) {
|
|
179
|
+
directories.push(current);
|
|
180
|
+
seen.add(current);
|
|
181
|
+
}
|
|
182
|
+
if (current === boundary) {
|
|
183
|
+
break;
|
|
184
|
+
}
|
|
185
|
+
const parent = path.dirname(current);
|
|
186
|
+
if (parent === current) {
|
|
187
|
+
break;
|
|
188
|
+
}
|
|
189
|
+
current = parent;
|
|
100
190
|
}
|
|
101
|
-
|
|
191
|
+
if (!seen.has(boundary)) {
|
|
192
|
+
directories.push(boundary);
|
|
193
|
+
}
|
|
194
|
+
return directories;
|
|
195
|
+
}
|
|
196
|
+
function buildSearchRoots2(evalPath, repoRoot) {
|
|
197
|
+
const uniqueRoots = [];
|
|
198
|
+
const addRoot = (root) => {
|
|
199
|
+
const normalized = path.resolve(root);
|
|
200
|
+
if (!uniqueRoots.includes(normalized)) {
|
|
201
|
+
uniqueRoots.push(normalized);
|
|
202
|
+
}
|
|
203
|
+
};
|
|
204
|
+
let currentDir = path.dirname(evalPath);
|
|
205
|
+
let reachedBoundary = false;
|
|
206
|
+
while (!reachedBoundary) {
|
|
207
|
+
addRoot(currentDir);
|
|
208
|
+
const parentDir = path.dirname(currentDir);
|
|
209
|
+
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
210
|
+
reachedBoundary = true;
|
|
211
|
+
} else {
|
|
212
|
+
currentDir = parentDir;
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
addRoot(repoRoot);
|
|
216
|
+
addRoot(process.cwd());
|
|
217
|
+
return uniqueRoots;
|
|
218
|
+
}
|
|
219
|
+
function trimLeadingSeparators(value) {
|
|
220
|
+
const trimmed = value.replace(/^[/\\]+/, "");
|
|
221
|
+
return trimmed.length > 0 ? trimmed : value;
|
|
102
222
|
}
|
|
223
|
+
async function resolveFileReference2(rawValue, searchRoots) {
|
|
224
|
+
const displayPath = trimLeadingSeparators(rawValue);
|
|
225
|
+
const potentialPaths = [];
|
|
226
|
+
if (path.isAbsolute(rawValue)) {
|
|
227
|
+
potentialPaths.push(path.normalize(rawValue));
|
|
228
|
+
}
|
|
229
|
+
for (const base of searchRoots) {
|
|
230
|
+
potentialPaths.push(path.resolve(base, displayPath));
|
|
231
|
+
}
|
|
232
|
+
const attempted = [];
|
|
233
|
+
const seen = /* @__PURE__ */ new Set();
|
|
234
|
+
for (const candidate of potentialPaths) {
|
|
235
|
+
const absoluteCandidate = path.resolve(candidate);
|
|
236
|
+
if (seen.has(absoluteCandidate)) {
|
|
237
|
+
continue;
|
|
238
|
+
}
|
|
239
|
+
seen.add(absoluteCandidate);
|
|
240
|
+
attempted.push(absoluteCandidate);
|
|
241
|
+
if (await fileExists2(absoluteCandidate)) {
|
|
242
|
+
return { displayPath, resolvedPath: absoluteCandidate, attempted };
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
return { displayPath, attempted };
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
// src/evaluation/loaders/config-loader.ts
|
|
249
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
250
|
+
var ANSI_YELLOW = "\x1B[33m";
|
|
251
|
+
var ANSI_RESET = "\x1B[0m";
|
|
103
252
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
104
|
-
const directories =
|
|
253
|
+
const directories = buildDirectoryChain2(evalFilePath, repoRoot);
|
|
105
254
|
for (const directory of directories) {
|
|
106
|
-
const configPath =
|
|
255
|
+
const configPath = path2.join(directory, ".agentv", "config.yaml");
|
|
107
256
|
if (!await fileExists2(configPath)) {
|
|
108
257
|
continue;
|
|
109
258
|
}
|
|
@@ -146,24 +295,134 @@ function isGuidelineFile(filePath, patterns) {
|
|
|
146
295
|
const patternsToUse = patterns ?? [];
|
|
147
296
|
return micromatch.isMatch(normalized, patternsToUse);
|
|
148
297
|
}
|
|
149
|
-
function
|
|
150
|
-
const
|
|
151
|
-
|
|
152
|
-
const
|
|
153
|
-
if (typeof
|
|
298
|
+
function extractTargetFromSuite(suite) {
|
|
299
|
+
const execution = suite.execution;
|
|
300
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
301
|
+
const executionTarget = execution.target;
|
|
302
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
303
|
+
return executionTarget.trim();
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
const targetValue = suite.target;
|
|
307
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
308
|
+
return targetValue.trim();
|
|
309
|
+
}
|
|
310
|
+
return void 0;
|
|
311
|
+
}
|
|
312
|
+
function logWarning(message) {
|
|
313
|
+
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
317
|
+
import path3 from "node:path";
|
|
318
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
319
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
320
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
321
|
+
const execution = rawEvalCase.execution;
|
|
322
|
+
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
323
|
+
if (candidateEvaluators === void 0) {
|
|
324
|
+
return void 0;
|
|
325
|
+
}
|
|
326
|
+
if (!Array.isArray(candidateEvaluators)) {
|
|
327
|
+
logWarning2(`Skipping evaluators for '${evalId}': expected array`);
|
|
328
|
+
return void 0;
|
|
329
|
+
}
|
|
330
|
+
const evaluators = [];
|
|
331
|
+
for (const rawEvaluator of candidateEvaluators) {
|
|
332
|
+
if (!isJsonObject2(rawEvaluator)) {
|
|
333
|
+
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
154
334
|
continue;
|
|
155
335
|
}
|
|
156
|
-
const
|
|
157
|
-
|
|
336
|
+
const name = asString2(rawEvaluator.name);
|
|
337
|
+
const typeValue = rawEvaluator.type;
|
|
338
|
+
if (!name || !isEvaluatorKind(typeValue)) {
|
|
339
|
+
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
158
340
|
continue;
|
|
159
341
|
}
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
342
|
+
if (typeValue === "code") {
|
|
343
|
+
const script = asString2(rawEvaluator.script);
|
|
344
|
+
if (!script) {
|
|
345
|
+
logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
346
|
+
continue;
|
|
347
|
+
}
|
|
348
|
+
const cwd = asString2(rawEvaluator.cwd);
|
|
349
|
+
let resolvedCwd;
|
|
350
|
+
if (cwd) {
|
|
351
|
+
const resolved = await resolveFileReference2(cwd, searchRoots);
|
|
352
|
+
if (resolved.resolvedPath) {
|
|
353
|
+
resolvedCwd = path3.resolve(resolved.resolvedPath);
|
|
354
|
+
} else {
|
|
355
|
+
logWarning2(
|
|
356
|
+
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
357
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
358
|
+
);
|
|
359
|
+
}
|
|
360
|
+
} else {
|
|
361
|
+
resolvedCwd = searchRoots[0];
|
|
362
|
+
}
|
|
363
|
+
evaluators.push({
|
|
364
|
+
name,
|
|
365
|
+
type: "code",
|
|
366
|
+
script,
|
|
367
|
+
cwd,
|
|
368
|
+
resolvedCwd
|
|
369
|
+
});
|
|
370
|
+
continue;
|
|
163
371
|
}
|
|
372
|
+
const prompt = asString2(rawEvaluator.prompt);
|
|
373
|
+
let promptPath;
|
|
374
|
+
if (prompt) {
|
|
375
|
+
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
376
|
+
if (resolved.resolvedPath) {
|
|
377
|
+
promptPath = path3.resolve(resolved.resolvedPath);
|
|
378
|
+
} else {
|
|
379
|
+
logWarning2(
|
|
380
|
+
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
381
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
382
|
+
);
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
const _model = asString2(rawEvaluator.model);
|
|
386
|
+
evaluators.push({
|
|
387
|
+
name,
|
|
388
|
+
type: "llm_judge",
|
|
389
|
+
prompt,
|
|
390
|
+
promptPath
|
|
391
|
+
});
|
|
164
392
|
}
|
|
165
|
-
return
|
|
393
|
+
return evaluators.length > 0 ? evaluators : void 0;
|
|
394
|
+
}
|
|
395
|
+
function coerceEvaluator(candidate, contextId) {
|
|
396
|
+
if (typeof candidate !== "string") {
|
|
397
|
+
return void 0;
|
|
398
|
+
}
|
|
399
|
+
if (isEvaluatorKind(candidate)) {
|
|
400
|
+
return candidate;
|
|
401
|
+
}
|
|
402
|
+
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
403
|
+
return void 0;
|
|
404
|
+
}
|
|
405
|
+
function asString2(value) {
|
|
406
|
+
return typeof value === "string" ? value : void 0;
|
|
166
407
|
}
|
|
408
|
+
function isJsonObject2(value) {
|
|
409
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
410
|
+
}
|
|
411
|
+
function logWarning2(message, details) {
|
|
412
|
+
if (details && details.length > 0) {
|
|
413
|
+
const detailBlock = details.join("\n");
|
|
414
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}
|
|
415
|
+
${detailBlock}${ANSI_RESET2}`);
|
|
416
|
+
} else {
|
|
417
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
// src/evaluation/loaders/message-processor.ts
|
|
422
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
423
|
+
import path4 from "node:path";
|
|
424
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
425
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
167
426
|
async function processMessages(options) {
|
|
168
427
|
const {
|
|
169
428
|
messages,
|
|
@@ -189,28 +448,28 @@ async function processMessages(options) {
|
|
|
189
448
|
if (!isJsonObject(rawSegment)) {
|
|
190
449
|
continue;
|
|
191
450
|
}
|
|
192
|
-
const segmentType =
|
|
451
|
+
const segmentType = asString3(rawSegment.type);
|
|
193
452
|
if (segmentType === "file") {
|
|
194
|
-
const rawValue =
|
|
453
|
+
const rawValue = asString3(rawSegment.value);
|
|
195
454
|
if (!rawValue) {
|
|
196
455
|
continue;
|
|
197
456
|
}
|
|
198
|
-
const { displayPath, resolvedPath, attempted } = await
|
|
457
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
199
458
|
rawValue,
|
|
200
459
|
searchRoots
|
|
201
460
|
);
|
|
202
461
|
if (!resolvedPath) {
|
|
203
462
|
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
204
463
|
const context = messageType === "input" ? "" : " in expected_messages";
|
|
205
|
-
|
|
464
|
+
logWarning3(`File not found${context}: ${displayPath}`, attempts);
|
|
206
465
|
continue;
|
|
207
466
|
}
|
|
208
467
|
try {
|
|
209
|
-
const fileContent = (await
|
|
468
|
+
const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
210
469
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
211
|
-
const relativeToRepo =
|
|
470
|
+
const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
|
|
212
471
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
213
|
-
guidelinePaths.push(
|
|
472
|
+
guidelinePaths.push(path4.resolve(resolvedPath));
|
|
214
473
|
if (verbose) {
|
|
215
474
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
216
475
|
console.log(` Resolved to: ${resolvedPath}`);
|
|
@@ -222,7 +481,7 @@ async function processMessages(options) {
|
|
|
222
481
|
type: "file",
|
|
223
482
|
path: displayPath,
|
|
224
483
|
text: fileContent,
|
|
225
|
-
resolvedPath:
|
|
484
|
+
resolvedPath: path4.resolve(resolvedPath)
|
|
226
485
|
});
|
|
227
486
|
if (verbose) {
|
|
228
487
|
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
@@ -231,7 +490,7 @@ async function processMessages(options) {
|
|
|
231
490
|
}
|
|
232
491
|
} catch (error) {
|
|
233
492
|
const context = messageType === "input" ? "" : " expected output";
|
|
234
|
-
|
|
493
|
+
logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
235
494
|
}
|
|
236
495
|
continue;
|
|
237
496
|
}
|
|
@@ -245,202 +504,120 @@ async function processMessages(options) {
|
|
|
245
504
|
}
|
|
246
505
|
return segments;
|
|
247
506
|
}
|
|
248
|
-
async function
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
const absoluteTestPath = path.resolve(evalFilePath);
|
|
252
|
-
if (!await fileExists2(absoluteTestPath)) {
|
|
253
|
-
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
254
|
-
}
|
|
255
|
-
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
256
|
-
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
257
|
-
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
258
|
-
const guidelinePatterns = config?.guideline_patterns;
|
|
259
|
-
const rawFile = await readFile(absoluteTestPath, "utf8");
|
|
260
|
-
const parsed = parse(rawFile);
|
|
261
|
-
if (!isJsonObject(parsed)) {
|
|
262
|
-
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
263
|
-
}
|
|
264
|
-
const suite = parsed;
|
|
265
|
-
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
266
|
-
const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
267
|
-
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
268
|
-
const schema = suite.$schema;
|
|
269
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
270
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
271
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
272
|
-
throw new Error(message);
|
|
507
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
508
|
+
if (typeof content === "string") {
|
|
509
|
+
return content;
|
|
273
510
|
}
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
511
|
+
if (!content) {
|
|
512
|
+
return "";
|
|
277
513
|
}
|
|
278
|
-
const
|
|
279
|
-
const
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
for (const rawEvalcase of rawTestcases) {
|
|
283
|
-
if (!isJsonObject(rawEvalcase)) {
|
|
284
|
-
logWarning("Skipping invalid eval case entry (expected object)");
|
|
514
|
+
const parts = [];
|
|
515
|
+
for (const entry of content) {
|
|
516
|
+
if (typeof entry === "string") {
|
|
517
|
+
parts.push({ content: entry, isFile: false });
|
|
285
518
|
continue;
|
|
286
519
|
}
|
|
287
|
-
|
|
288
|
-
const id = asString(evalcase.id);
|
|
289
|
-
if (evalIdFilter && id !== evalIdFilter) {
|
|
520
|
+
if (!isJsonObject(entry)) {
|
|
290
521
|
continue;
|
|
291
522
|
}
|
|
292
|
-
const
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
523
|
+
const segmentType = asString3(entry.type);
|
|
524
|
+
if (segmentType === "file") {
|
|
525
|
+
const rawValue = asString3(entry.value);
|
|
526
|
+
if (!rawValue) {
|
|
527
|
+
continue;
|
|
528
|
+
}
|
|
529
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
530
|
+
rawValue,
|
|
531
|
+
searchRoots
|
|
532
|
+
);
|
|
533
|
+
if (!resolvedPath) {
|
|
534
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
535
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
536
|
+
continue;
|
|
537
|
+
}
|
|
538
|
+
try {
|
|
539
|
+
const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
540
|
+
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
541
|
+
if (verbose) {
|
|
542
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
543
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
544
|
+
}
|
|
545
|
+
} catch (error) {
|
|
546
|
+
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
547
|
+
}
|
|
298
548
|
continue;
|
|
299
549
|
}
|
|
300
|
-
const
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
304
|
-
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
550
|
+
const textValue = asString3(entry.text);
|
|
551
|
+
if (typeof textValue === "string") {
|
|
552
|
+
parts.push({ content: textValue, isFile: false });
|
|
305
553
|
continue;
|
|
306
554
|
}
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
const inputTextParts = [];
|
|
312
|
-
const inputSegments = await processMessages({
|
|
313
|
-
messages: inputMessages,
|
|
314
|
-
searchRoots,
|
|
315
|
-
repoRootPath,
|
|
316
|
-
guidelinePatterns,
|
|
317
|
-
guidelinePaths,
|
|
318
|
-
textParts: inputTextParts,
|
|
319
|
-
messageType: "input",
|
|
320
|
-
verbose
|
|
321
|
-
});
|
|
322
|
-
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
323
|
-
messages: expectedMessages,
|
|
324
|
-
searchRoots,
|
|
325
|
-
repoRootPath,
|
|
326
|
-
guidelinePatterns,
|
|
327
|
-
messageType: "output",
|
|
328
|
-
verbose
|
|
329
|
-
}) : [];
|
|
330
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
331
|
-
const expectedContent = expectedMessages[0]?.content;
|
|
332
|
-
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
333
|
-
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
334
|
-
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
335
|
-
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
336
|
-
const userFilePaths = [];
|
|
337
|
-
for (const segment of inputSegments) {
|
|
338
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
339
|
-
userFilePaths.push(segment.resolvedPath);
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
const allFilePaths = [
|
|
343
|
-
...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
344
|
-
...userFilePaths
|
|
345
|
-
];
|
|
346
|
-
const testCase = {
|
|
347
|
-
id,
|
|
348
|
-
dataset: datasetName,
|
|
349
|
-
conversation_id: conversationId,
|
|
350
|
-
question,
|
|
351
|
-
input_messages: inputMessages,
|
|
352
|
-
input_segments: inputSegments,
|
|
353
|
-
output_segments: outputSegments,
|
|
354
|
-
reference_answer: referenceAnswer,
|
|
355
|
-
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
356
|
-
guideline_patterns: guidelinePatterns,
|
|
357
|
-
file_paths: allFilePaths,
|
|
358
|
-
code_snippets: codeSnippets,
|
|
359
|
-
expected_outcome: outcome,
|
|
360
|
-
evaluator: evalCaseEvaluatorKind,
|
|
361
|
-
evaluators
|
|
362
|
-
};
|
|
363
|
-
if (verbose) {
|
|
364
|
-
console.log(`
|
|
365
|
-
[Eval Case: ${id}]`);
|
|
366
|
-
if (testCase.guideline_paths.length > 0) {
|
|
367
|
-
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
368
|
-
for (const guidelinePath of testCase.guideline_paths) {
|
|
369
|
-
console.log(` - ${guidelinePath}`);
|
|
370
|
-
}
|
|
371
|
-
} else {
|
|
372
|
-
console.log(" No guidelines found");
|
|
373
|
-
}
|
|
555
|
+
const valueValue = asString3(entry.value);
|
|
556
|
+
if (typeof valueValue === "string") {
|
|
557
|
+
parts.push({ content: valueValue, isFile: false });
|
|
558
|
+
continue;
|
|
374
559
|
}
|
|
375
|
-
|
|
560
|
+
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
376
561
|
}
|
|
377
|
-
return
|
|
562
|
+
return formatFileContents(parts);
|
|
378
563
|
}
|
|
379
|
-
function
|
|
380
|
-
|
|
381
|
-
return true;
|
|
382
|
-
}
|
|
383
|
-
let messagesWithContent = 0;
|
|
384
|
-
for (const segments of processedSegmentsByMessage) {
|
|
385
|
-
if (hasVisibleContent(segments)) {
|
|
386
|
-
messagesWithContent++;
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
return messagesWithContent > 1;
|
|
564
|
+
function asString3(value) {
|
|
565
|
+
return typeof value === "string" ? value : void 0;
|
|
390
566
|
}
|
|
391
|
-
function
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
if (type === "text") {
|
|
395
|
-
const value = asString(segment.value);
|
|
396
|
-
return value !== void 0 && value.trim().length > 0;
|
|
397
|
-
}
|
|
398
|
-
if (type === "guideline_ref") {
|
|
399
|
-
return false;
|
|
400
|
-
}
|
|
401
|
-
if (type === "file") {
|
|
402
|
-
const text = asString(segment.text);
|
|
403
|
-
return text !== void 0 && text.trim().length > 0;
|
|
404
|
-
}
|
|
405
|
-
return false;
|
|
406
|
-
});
|
|
567
|
+
function cloneJsonObject(source) {
|
|
568
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
569
|
+
return Object.fromEntries(entries);
|
|
407
570
|
}
|
|
408
|
-
function
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
return asString(segment.value);
|
|
571
|
+
function cloneJsonValue(value) {
|
|
572
|
+
if (value === null) {
|
|
573
|
+
return null;
|
|
412
574
|
}
|
|
413
|
-
if (
|
|
414
|
-
|
|
415
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
575
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
576
|
+
return value;
|
|
416
577
|
}
|
|
417
|
-
if (
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
578
|
+
if (Array.isArray(value)) {
|
|
579
|
+
return value.map((item) => cloneJsonValue(item));
|
|
580
|
+
}
|
|
581
|
+
if (typeof value === "object") {
|
|
582
|
+
return cloneJsonObject(value);
|
|
583
|
+
}
|
|
584
|
+
return value;
|
|
585
|
+
}
|
|
586
|
+
function logWarning3(message, details) {
|
|
587
|
+
if (details && details.length > 0) {
|
|
588
|
+
const detailBlock = details.join("\n");
|
|
589
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
590
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
591
|
+
} else {
|
|
592
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
424
593
|
}
|
|
425
|
-
return void 0;
|
|
426
594
|
}
|
|
595
|
+
|
|
596
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
597
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
598
|
+
import path5 from "node:path";
|
|
599
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
600
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
427
601
|
async function buildPromptInputs(testCase) {
|
|
428
|
-
const
|
|
602
|
+
const guidelineParts = [];
|
|
429
603
|
for (const rawPath of testCase.guideline_paths) {
|
|
430
|
-
const absolutePath =
|
|
604
|
+
const absolutePath = path5.resolve(rawPath);
|
|
431
605
|
if (!await fileExists2(absolutePath)) {
|
|
432
|
-
|
|
606
|
+
logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
433
607
|
continue;
|
|
434
608
|
}
|
|
435
609
|
try {
|
|
436
|
-
const content = (await
|
|
437
|
-
|
|
438
|
-
|
|
610
|
+
const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
611
|
+
guidelineParts.push({
|
|
612
|
+
content,
|
|
613
|
+
isFile: true,
|
|
614
|
+
displayPath: path5.basename(absolutePath)
|
|
615
|
+
});
|
|
439
616
|
} catch (error) {
|
|
440
|
-
|
|
617
|
+
logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
441
618
|
}
|
|
442
619
|
}
|
|
443
|
-
const guidelines =
|
|
620
|
+
const guidelines = formatFileContents(guidelineParts);
|
|
444
621
|
const segmentsByMessage = [];
|
|
445
622
|
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
446
623
|
for (const segment of testCase.input_segments) {
|
|
@@ -461,9 +638,9 @@ ${content}`);
|
|
|
461
638
|
messageSegments.push({ type: "text", value: segment });
|
|
462
639
|
}
|
|
463
640
|
} else if (isJsonObject(segment)) {
|
|
464
|
-
const type =
|
|
641
|
+
const type = asString4(segment.type);
|
|
465
642
|
if (type === "file") {
|
|
466
|
-
const value =
|
|
643
|
+
const value = asString4(segment.value);
|
|
467
644
|
if (!value) continue;
|
|
468
645
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
469
646
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -474,7 +651,7 @@ ${content}`);
|
|
|
474
651
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
475
652
|
}
|
|
476
653
|
} else if (type === "text") {
|
|
477
|
-
const textValue =
|
|
654
|
+
const textValue = asString4(segment.value);
|
|
478
655
|
if (textValue && textValue.trim().length > 0) {
|
|
479
656
|
messageSegments.push({ type: "text", value: textValue });
|
|
480
657
|
}
|
|
@@ -530,6 +707,18 @@ ${messageContent}`);
|
|
|
530
707
|
}) : void 0;
|
|
531
708
|
return { question, guidelines, chatPrompt };
|
|
532
709
|
}
|
|
710
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
711
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
712
|
+
return true;
|
|
713
|
+
}
|
|
714
|
+
let messagesWithContent = 0;
|
|
715
|
+
for (const segments of processedSegmentsByMessage) {
|
|
716
|
+
if (hasVisibleContent(segments)) {
|
|
717
|
+
messagesWithContent++;
|
|
718
|
+
}
|
|
719
|
+
}
|
|
720
|
+
return messagesWithContent > 1;
|
|
721
|
+
}
|
|
533
722
|
function buildChatPromptFromSegments(options) {
|
|
534
723
|
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
535
724
|
if (messages.length === 0) {
|
|
@@ -596,201 +785,175 @@ ${guidelineContent.trim()}`);
|
|
|
596
785
|
continue;
|
|
597
786
|
}
|
|
598
787
|
chatPrompt.push({
|
|
599
|
-
role,
|
|
600
|
-
content: contentParts.join("\n"),
|
|
601
|
-
...name ? { name } : {}
|
|
602
|
-
});
|
|
603
|
-
}
|
|
604
|
-
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
605
|
-
}
|
|
606
|
-
async function fileExists2(absolutePath) {
|
|
607
|
-
try {
|
|
608
|
-
await access(absolutePath, constants.F_OK);
|
|
609
|
-
return true;
|
|
610
|
-
} catch {
|
|
611
|
-
return false;
|
|
612
|
-
}
|
|
613
|
-
}
|
|
614
|
-
function resolveToAbsolutePath(candidate) {
|
|
615
|
-
if (candidate instanceof URL) {
|
|
616
|
-
return fileURLToPath(candidate);
|
|
617
|
-
}
|
|
618
|
-
if (typeof candidate === "string") {
|
|
619
|
-
if (candidate.startsWith("file://")) {
|
|
620
|
-
return fileURLToPath(new URL(candidate));
|
|
621
|
-
}
|
|
622
|
-
return path.resolve(candidate);
|
|
788
|
+
role,
|
|
789
|
+
content: contentParts.join("\n"),
|
|
790
|
+
...name ? { name } : {}
|
|
791
|
+
});
|
|
623
792
|
}
|
|
624
|
-
|
|
793
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
625
794
|
}
|
|
626
|
-
function
|
|
795
|
+
function asString4(value) {
|
|
627
796
|
return typeof value === "string" ? value : void 0;
|
|
628
797
|
}
|
|
629
|
-
function
|
|
630
|
-
|
|
631
|
-
return Object.fromEntries(entries);
|
|
798
|
+
function logWarning4(message) {
|
|
799
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
632
800
|
}
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
801
|
+
|
|
802
|
+
// src/evaluation/yaml-parser.ts
|
|
803
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
804
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
805
|
+
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
806
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
807
|
+
try {
|
|
808
|
+
const absolutePath = path6.resolve(testFilePath);
|
|
809
|
+
const content = await readFile4(absolutePath, "utf8");
|
|
810
|
+
const parsed = parse2(content);
|
|
811
|
+
if (!isJsonObject(parsed)) {
|
|
812
|
+
return {};
|
|
813
|
+
}
|
|
814
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
815
|
+
} catch {
|
|
816
|
+
return {};
|
|
642
817
|
}
|
|
643
|
-
return cloneJsonObject(value);
|
|
644
818
|
}
|
|
645
|
-
async function
|
|
646
|
-
|
|
647
|
-
|
|
819
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
820
|
+
const verbose = options?.verbose ?? false;
|
|
821
|
+
const evalIdFilter = options?.evalId;
|
|
822
|
+
const absoluteTestPath = path6.resolve(evalFilePath);
|
|
823
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
824
|
+
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
825
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
826
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
827
|
+
const rawFile = await readFile4(absoluteTestPath, "utf8");
|
|
828
|
+
const parsed = parse2(rawFile);
|
|
829
|
+
if (!isJsonObject(parsed)) {
|
|
830
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
648
831
|
}
|
|
649
|
-
|
|
650
|
-
|
|
832
|
+
const suite = parsed;
|
|
833
|
+
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
834
|
+
const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
835
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
836
|
+
const schema = suite.$schema;
|
|
837
|
+
if (schema !== SCHEMA_EVAL_V2) {
|
|
838
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
839
|
+
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
840
|
+
throw new Error(message);
|
|
651
841
|
}
|
|
652
|
-
const
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
const rawValue = asString(entry.value);
|
|
664
|
-
if (!rawValue) {
|
|
665
|
-
continue;
|
|
666
|
-
}
|
|
667
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
668
|
-
rawValue,
|
|
669
|
-
searchRoots
|
|
670
|
-
);
|
|
671
|
-
if (!resolvedPath) {
|
|
672
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
673
|
-
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
674
|
-
continue;
|
|
675
|
-
}
|
|
676
|
-
try {
|
|
677
|
-
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
678
|
-
parts.push(fileContent);
|
|
679
|
-
if (verbose) {
|
|
680
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
681
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
682
|
-
}
|
|
683
|
-
} catch (error) {
|
|
684
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
685
|
-
}
|
|
842
|
+
const rawTestcases = suite.evalcases;
|
|
843
|
+
if (!Array.isArray(rawTestcases)) {
|
|
844
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
845
|
+
}
|
|
846
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
847
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
848
|
+
const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
|
|
849
|
+
const results = [];
|
|
850
|
+
for (const rawEvalcase of rawTestcases) {
|
|
851
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
852
|
+
logWarning5("Skipping invalid eval case entry (expected object)");
|
|
686
853
|
continue;
|
|
687
854
|
}
|
|
688
|
-
const
|
|
689
|
-
|
|
690
|
-
|
|
855
|
+
const evalcase = rawEvalcase;
|
|
856
|
+
const id = asString5(evalcase.id);
|
|
857
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
691
858
|
continue;
|
|
692
859
|
}
|
|
693
|
-
const
|
|
694
|
-
|
|
695
|
-
|
|
860
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
861
|
+
const outcome = asString5(evalcase.outcome);
|
|
862
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
863
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
864
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
865
|
+
logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
696
866
|
continue;
|
|
697
867
|
}
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
const execution = rawEvalCase.execution;
|
|
704
|
-
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
705
|
-
if (candidateEvaluators === void 0) {
|
|
706
|
-
return void 0;
|
|
707
|
-
}
|
|
708
|
-
if (!Array.isArray(candidateEvaluators)) {
|
|
709
|
-
logWarning(`Skipping evaluators for '${evalId}': expected array`);
|
|
710
|
-
return void 0;
|
|
711
|
-
}
|
|
712
|
-
const evaluators = [];
|
|
713
|
-
for (const rawEvaluator of candidateEvaluators) {
|
|
714
|
-
if (!isJsonObject(rawEvaluator)) {
|
|
715
|
-
logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
868
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
869
|
+
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
870
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
871
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
872
|
+
logWarning5(`No valid expected message found for eval case: ${id}`);
|
|
716
873
|
continue;
|
|
717
874
|
}
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
if (!name || !isEvaluatorKind(typeValue)) {
|
|
721
|
-
logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
722
|
-
continue;
|
|
875
|
+
if (expectedMessages.length > 1) {
|
|
876
|
+
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
723
877
|
}
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
878
|
+
const guidelinePaths = [];
|
|
879
|
+
const inputTextParts = [];
|
|
880
|
+
const inputSegments = await processMessages({
|
|
881
|
+
messages: inputMessages,
|
|
882
|
+
searchRoots,
|
|
883
|
+
repoRootPath,
|
|
884
|
+
guidelinePatterns,
|
|
885
|
+
guidelinePaths,
|
|
886
|
+
textParts: inputTextParts,
|
|
887
|
+
messageType: "input",
|
|
888
|
+
verbose
|
|
889
|
+
});
|
|
890
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
891
|
+
messages: expectedMessages,
|
|
892
|
+
searchRoots,
|
|
893
|
+
repoRootPath,
|
|
894
|
+
guidelinePatterns,
|
|
895
|
+
messageType: "output",
|
|
896
|
+
verbose
|
|
897
|
+
}) : [];
|
|
898
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
899
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
900
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
901
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
902
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
903
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
904
|
+
const userFilePaths = [];
|
|
905
|
+
for (const segment of inputSegments) {
|
|
906
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
907
|
+
userFilePaths.push(segment.resolvedPath);
|
|
744
908
|
}
|
|
745
|
-
evaluators.push({
|
|
746
|
-
name,
|
|
747
|
-
type: "code",
|
|
748
|
-
script,
|
|
749
|
-
cwd,
|
|
750
|
-
resolvedCwd
|
|
751
|
-
});
|
|
752
|
-
continue;
|
|
753
909
|
}
|
|
754
|
-
const
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
910
|
+
const allFilePaths = [
|
|
911
|
+
...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
912
|
+
...userFilePaths
|
|
913
|
+
];
|
|
914
|
+
const testCase = {
|
|
915
|
+
id,
|
|
916
|
+
dataset: datasetName,
|
|
917
|
+
conversation_id: conversationId,
|
|
918
|
+
question,
|
|
919
|
+
input_messages: inputMessages,
|
|
920
|
+
input_segments: inputSegments,
|
|
921
|
+
output_segments: outputSegments,
|
|
922
|
+
reference_answer: referenceAnswer,
|
|
923
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
924
|
+
guideline_patterns: guidelinePatterns,
|
|
925
|
+
file_paths: allFilePaths,
|
|
926
|
+
code_snippets: codeSnippets,
|
|
927
|
+
expected_outcome: outcome,
|
|
928
|
+
evaluator: evalCaseEvaluatorKind,
|
|
929
|
+
evaluators
|
|
930
|
+
};
|
|
931
|
+
if (verbose) {
|
|
932
|
+
console.log(`
|
|
933
|
+
[Eval Case: ${id}]`);
|
|
934
|
+
if (testCase.guideline_paths.length > 0) {
|
|
935
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
936
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
937
|
+
console.log(` - ${guidelinePath}`);
|
|
938
|
+
}
|
|
760
939
|
} else {
|
|
761
|
-
|
|
762
|
-
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
763
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
764
|
-
);
|
|
940
|
+
console.log(" No guidelines found");
|
|
765
941
|
}
|
|
766
942
|
}
|
|
767
|
-
|
|
768
|
-
evaluators.push({
|
|
769
|
-
name,
|
|
770
|
-
type: "llm_judge",
|
|
771
|
-
prompt,
|
|
772
|
-
promptPath
|
|
773
|
-
});
|
|
943
|
+
results.push(testCase);
|
|
774
944
|
}
|
|
775
|
-
return
|
|
945
|
+
return results;
|
|
776
946
|
}
|
|
777
|
-
function
|
|
778
|
-
|
|
779
|
-
return void 0;
|
|
780
|
-
}
|
|
781
|
-
if (isEvaluatorKind(candidate)) {
|
|
782
|
-
return candidate;
|
|
783
|
-
}
|
|
784
|
-
logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
785
|
-
return void 0;
|
|
947
|
+
function asString5(value) {
|
|
948
|
+
return typeof value === "string" ? value : void 0;
|
|
786
949
|
}
|
|
787
|
-
function
|
|
950
|
+
function logWarning5(message, details) {
|
|
788
951
|
if (details && details.length > 0) {
|
|
789
952
|
const detailBlock = details.join("\n");
|
|
790
|
-
console.warn(`${
|
|
791
|
-
${detailBlock}${
|
|
953
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
954
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
792
955
|
} else {
|
|
793
|
-
console.warn(`${
|
|
956
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
794
957
|
}
|
|
795
958
|
}
|
|
796
959
|
|
|
@@ -822,9 +985,8 @@ function buildChatPrompt(request) {
|
|
|
822
985
|
}
|
|
823
986
|
function resolveSystemContent(request) {
|
|
824
987
|
const systemSegments = [];
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
systemSegments.push(metadataSystemPrompt.trim());
|
|
988
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
989
|
+
systemSegments.push(request.systemPrompt.trim());
|
|
828
990
|
} else {
|
|
829
991
|
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
830
992
|
}
|
|
@@ -1077,7 +1239,7 @@ var GeminiProvider = class {
|
|
|
1077
1239
|
import { exec as execWithCallback } from "node:child_process";
|
|
1078
1240
|
import fs from "node:fs/promises";
|
|
1079
1241
|
import os from "node:os";
|
|
1080
|
-
import
|
|
1242
|
+
import path7 from "node:path";
|
|
1081
1243
|
import { promisify } from "node:util";
|
|
1082
1244
|
var execAsync = promisify(execWithCallback);
|
|
1083
1245
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
@@ -1256,7 +1418,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
1256
1418
|
}
|
|
1257
1419
|
const unique = /* @__PURE__ */ new Map();
|
|
1258
1420
|
for (const inputFile of inputFiles) {
|
|
1259
|
-
const absolutePath =
|
|
1421
|
+
const absolutePath = path7.resolve(inputFile);
|
|
1260
1422
|
if (!unique.has(absolutePath)) {
|
|
1261
1423
|
unique.set(absolutePath, absolutePath);
|
|
1262
1424
|
}
|
|
@@ -1270,7 +1432,7 @@ function formatFileList(files, template) {
|
|
|
1270
1432
|
const formatter = template ?? "{path}";
|
|
1271
1433
|
return files.map((filePath) => {
|
|
1272
1434
|
const escapedPath = shellEscape(filePath);
|
|
1273
|
-
const escapedName = shellEscape(
|
|
1435
|
+
const escapedName = shellEscape(path7.basename(filePath));
|
|
1274
1436
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
1275
1437
|
}).join(" ");
|
|
1276
1438
|
}
|
|
@@ -1294,7 +1456,7 @@ function generateOutputFilePath(evalCaseId) {
|
|
|
1294
1456
|
const safeEvalId = evalCaseId || "unknown";
|
|
1295
1457
|
const timestamp = Date.now();
|
|
1296
1458
|
const random = Math.random().toString(36).substring(2, 9);
|
|
1297
|
-
return
|
|
1459
|
+
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
1298
1460
|
}
|
|
1299
1461
|
function formatTimeoutSuffix(timeoutMs) {
|
|
1300
1462
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -1310,7 +1472,7 @@ import { randomUUID } from "node:crypto";
|
|
|
1310
1472
|
import { constants as constants2, createWriteStream } from "node:fs";
|
|
1311
1473
|
import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
1312
1474
|
import { tmpdir } from "node:os";
|
|
1313
|
-
import
|
|
1475
|
+
import path9 from "node:path";
|
|
1314
1476
|
import { promisify as promisify2 } from "node:util";
|
|
1315
1477
|
|
|
1316
1478
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -1367,7 +1529,7 @@ function subscribeToCodexLogEntries(listener) {
|
|
|
1367
1529
|
}
|
|
1368
1530
|
|
|
1369
1531
|
// src/evaluation/providers/preread.ts
|
|
1370
|
-
import
|
|
1532
|
+
import path8 from "node:path";
|
|
1371
1533
|
function buildPromptDocument(request, inputFiles, options) {
|
|
1372
1534
|
const parts = [];
|
|
1373
1535
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -1392,7 +1554,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
1392
1554
|
}
|
|
1393
1555
|
const deduped = /* @__PURE__ */ new Map();
|
|
1394
1556
|
for (const inputFile of inputFiles) {
|
|
1395
|
-
const absolutePath =
|
|
1557
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1396
1558
|
if (!deduped.has(absolutePath)) {
|
|
1397
1559
|
deduped.set(absolutePath, absolutePath);
|
|
1398
1560
|
}
|
|
@@ -1405,14 +1567,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
1405
1567
|
}
|
|
1406
1568
|
const unique = /* @__PURE__ */ new Map();
|
|
1407
1569
|
for (const inputFile of inputFiles) {
|
|
1408
|
-
const absolutePath =
|
|
1570
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1409
1571
|
if (overrides?.has(absolutePath)) {
|
|
1410
1572
|
if (!unique.has(absolutePath)) {
|
|
1411
1573
|
unique.set(absolutePath, absolutePath);
|
|
1412
1574
|
}
|
|
1413
1575
|
continue;
|
|
1414
1576
|
}
|
|
1415
|
-
const normalized = absolutePath.split(
|
|
1577
|
+
const normalized = absolutePath.split(path8.sep).join("/");
|
|
1416
1578
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1417
1579
|
if (!unique.has(absolutePath)) {
|
|
1418
1580
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1427,7 +1589,7 @@ function collectInputFiles(inputFiles) {
|
|
|
1427
1589
|
}
|
|
1428
1590
|
const unique = /* @__PURE__ */ new Map();
|
|
1429
1591
|
for (const inputFile of inputFiles) {
|
|
1430
|
-
const absolutePath =
|
|
1592
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1431
1593
|
if (!unique.has(absolutePath)) {
|
|
1432
1594
|
unique.set(absolutePath, absolutePath);
|
|
1433
1595
|
}
|
|
@@ -1439,7 +1601,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
1439
1601
|
return "";
|
|
1440
1602
|
}
|
|
1441
1603
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1442
|
-
const fileName =
|
|
1604
|
+
const fileName = path8.basename(absolutePath);
|
|
1443
1605
|
const fileUri = pathToFileUri(absolutePath);
|
|
1444
1606
|
return `* [${fileName}](${fileUri})`;
|
|
1445
1607
|
});
|
|
@@ -1459,7 +1621,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
1459
1621
|
return sections.join("\n");
|
|
1460
1622
|
}
|
|
1461
1623
|
function pathToFileUri(filePath) {
|
|
1462
|
-
const absolutePath =
|
|
1624
|
+
const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
|
|
1463
1625
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1464
1626
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1465
1627
|
return `file:///${normalizedPath}`;
|
|
@@ -1497,7 +1659,7 @@ var CodexProvider = class {
|
|
|
1497
1659
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1498
1660
|
try {
|
|
1499
1661
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1500
|
-
const promptFile =
|
|
1662
|
+
const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
|
|
1501
1663
|
await writeFile(promptFile, promptContent, "utf8");
|
|
1502
1664
|
const args = this.buildCodexArgs();
|
|
1503
1665
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -1547,7 +1709,7 @@ var CodexProvider = class {
|
|
|
1547
1709
|
if (!this.config.cwd) {
|
|
1548
1710
|
return workspaceRoot;
|
|
1549
1711
|
}
|
|
1550
|
-
return
|
|
1712
|
+
return path9.resolve(this.config.cwd);
|
|
1551
1713
|
}
|
|
1552
1714
|
buildCodexArgs() {
|
|
1553
1715
|
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
@@ -1581,7 +1743,7 @@ var CodexProvider = class {
|
|
|
1581
1743
|
}
|
|
1582
1744
|
}
|
|
1583
1745
|
async createWorkspace() {
|
|
1584
|
-
return await mkdtemp(
|
|
1746
|
+
return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
|
|
1585
1747
|
}
|
|
1586
1748
|
async cleanupWorkspace(workspaceRoot) {
|
|
1587
1749
|
try {
|
|
@@ -1595,9 +1757,9 @@ var CodexProvider = class {
|
|
|
1595
1757
|
return void 0;
|
|
1596
1758
|
}
|
|
1597
1759
|
if (this.config.logDir) {
|
|
1598
|
-
return
|
|
1760
|
+
return path9.resolve(this.config.logDir);
|
|
1599
1761
|
}
|
|
1600
|
-
return
|
|
1762
|
+
return path9.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1601
1763
|
}
|
|
1602
1764
|
async createStreamLogger(request) {
|
|
1603
1765
|
const logDir = this.resolveLogDirectory();
|
|
@@ -1611,7 +1773,7 @@ var CodexProvider = class {
|
|
|
1611
1773
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1612
1774
|
return void 0;
|
|
1613
1775
|
}
|
|
1614
|
-
const filePath =
|
|
1776
|
+
const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
|
|
1615
1777
|
try {
|
|
1616
1778
|
const logger = await CodexStreamLogger.create({
|
|
1617
1779
|
filePath,
|
|
@@ -1826,7 +1988,7 @@ function tryParseJsonValue(rawLine) {
|
|
|
1826
1988
|
async function locateExecutable(candidate) {
|
|
1827
1989
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1828
1990
|
if (includesPathSeparator) {
|
|
1829
|
-
const resolved =
|
|
1991
|
+
const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
|
|
1830
1992
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
1831
1993
|
await access2(executablePath, constants2.F_OK);
|
|
1832
1994
|
return executablePath;
|
|
@@ -2173,7 +2335,7 @@ var MockProvider = class {
|
|
|
2173
2335
|
};
|
|
2174
2336
|
|
|
2175
2337
|
// src/evaluation/providers/vscode.ts
|
|
2176
|
-
import
|
|
2338
|
+
import path10 from "node:path";
|
|
2177
2339
|
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
2178
2340
|
var VSCodeProvider = class {
|
|
2179
2341
|
id;
|
|
@@ -2286,6 +2448,9 @@ var VSCodeProvider = class {
|
|
|
2286
2448
|
};
|
|
2287
2449
|
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
2288
2450
|
const parts = [];
|
|
2451
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
2452
|
+
parts.push(request.systemPrompt.trim());
|
|
2453
|
+
}
|
|
2289
2454
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2290
2455
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2291
2456
|
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
@@ -2303,7 +2468,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
2303
2468
|
return "";
|
|
2304
2469
|
}
|
|
2305
2470
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2306
|
-
const fileName =
|
|
2471
|
+
const fileName = path10.basename(absolutePath);
|
|
2307
2472
|
const fileUri = pathToFileUri2(absolutePath);
|
|
2308
2473
|
return `* [${fileName}](${fileUri})`;
|
|
2309
2474
|
});
|
|
@@ -2328,8 +2493,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
2328
2493
|
}
|
|
2329
2494
|
const unique = /* @__PURE__ */ new Map();
|
|
2330
2495
|
for (const attachment of attachments) {
|
|
2331
|
-
const absolutePath =
|
|
2332
|
-
const normalized = absolutePath.split(
|
|
2496
|
+
const absolutePath = path10.resolve(attachment);
|
|
2497
|
+
const normalized = absolutePath.split(path10.sep).join("/");
|
|
2333
2498
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2334
2499
|
if (!unique.has(absolutePath)) {
|
|
2335
2500
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2344,7 +2509,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
2344
2509
|
}
|
|
2345
2510
|
const unique = /* @__PURE__ */ new Map();
|
|
2346
2511
|
for (const attachment of attachments) {
|
|
2347
|
-
const absolutePath =
|
|
2512
|
+
const absolutePath = path10.resolve(attachment);
|
|
2348
2513
|
if (!unique.has(absolutePath)) {
|
|
2349
2514
|
unique.set(absolutePath, absolutePath);
|
|
2350
2515
|
}
|
|
@@ -2352,7 +2517,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
2352
2517
|
return Array.from(unique.values());
|
|
2353
2518
|
}
|
|
2354
2519
|
function pathToFileUri2(filePath) {
|
|
2355
|
-
const absolutePath =
|
|
2520
|
+
const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
|
|
2356
2521
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2357
2522
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2358
2523
|
return `file:///${normalizedPath}`;
|
|
@@ -2365,7 +2530,7 @@ function normalizeAttachments(attachments) {
|
|
|
2365
2530
|
}
|
|
2366
2531
|
const deduped = /* @__PURE__ */ new Set();
|
|
2367
2532
|
for (const attachment of attachments) {
|
|
2368
|
-
deduped.add(
|
|
2533
|
+
deduped.add(path10.resolve(attachment));
|
|
2369
2534
|
}
|
|
2370
2535
|
return Array.from(deduped);
|
|
2371
2536
|
}
|
|
@@ -2374,7 +2539,7 @@ function mergeAttachments(all) {
|
|
|
2374
2539
|
for (const list of all) {
|
|
2375
2540
|
if (!list) continue;
|
|
2376
2541
|
for (const inputFile of list) {
|
|
2377
|
-
deduped.add(
|
|
2542
|
+
deduped.add(path10.resolve(inputFile));
|
|
2378
2543
|
}
|
|
2379
2544
|
}
|
|
2380
2545
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -2420,9 +2585,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2420
2585
|
|
|
2421
2586
|
// src/evaluation/providers/targets-file.ts
|
|
2422
2587
|
import { constants as constants3 } from "node:fs";
|
|
2423
|
-
import { access as access3, readFile as
|
|
2424
|
-
import
|
|
2425
|
-
import { parse as
|
|
2588
|
+
import { access as access3, readFile as readFile5 } from "node:fs/promises";
|
|
2589
|
+
import path11 from "node:path";
|
|
2590
|
+
import { parse as parse3 } from "yaml";
|
|
2426
2591
|
function isRecord(value) {
|
|
2427
2592
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
2428
2593
|
}
|
|
@@ -2477,12 +2642,12 @@ async function fileExists3(filePath) {
|
|
|
2477
2642
|
}
|
|
2478
2643
|
}
|
|
2479
2644
|
async function readTargetDefinitions(filePath) {
|
|
2480
|
-
const absolutePath =
|
|
2645
|
+
const absolutePath = path11.resolve(filePath);
|
|
2481
2646
|
if (!await fileExists3(absolutePath)) {
|
|
2482
2647
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2483
2648
|
}
|
|
2484
|
-
const raw = await
|
|
2485
|
-
const parsed =
|
|
2649
|
+
const raw = await readFile5(absolutePath, "utf8");
|
|
2650
|
+
const parsed = parse3(raw);
|
|
2486
2651
|
if (!isRecord(parsed)) {
|
|
2487
2652
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
2488
2653
|
}
|
|
@@ -2525,18 +2690,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2525
2690
|
}
|
|
2526
2691
|
|
|
2527
2692
|
// src/evaluation/evaluators.ts
|
|
2528
|
-
|
|
2693
|
+
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
2694
|
+
|
|
2695
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
2696
|
+
|
|
2697
|
+
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
2698
|
+
|
|
2699
|
+
[[ ## expected_outcome ## ]]
|
|
2700
|
+
{{expected_outcome}}
|
|
2701
|
+
|
|
2702
|
+
[[ ## question ## ]]
|
|
2703
|
+
{{question}}
|
|
2704
|
+
|
|
2705
|
+
[[ ## reference_answer ## ]]
|
|
2706
|
+
{{reference_answer}}
|
|
2707
|
+
|
|
2708
|
+
[[ ## candidate_answer ## ]]
|
|
2709
|
+
{{candidate_answer}}`;
|
|
2529
2710
|
var LlmJudgeEvaluator = class {
|
|
2530
2711
|
kind = "llm_judge";
|
|
2531
2712
|
resolveJudgeProvider;
|
|
2532
2713
|
maxOutputTokens;
|
|
2533
2714
|
temperature;
|
|
2534
|
-
|
|
2715
|
+
evaluatorTemplate;
|
|
2535
2716
|
constructor(options) {
|
|
2536
2717
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
2537
2718
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
2538
2719
|
this.temperature = options.temperature;
|
|
2539
|
-
this.
|
|
2720
|
+
this.evaluatorTemplate = options.evaluatorTemplate;
|
|
2540
2721
|
}
|
|
2541
2722
|
async evaluate(context) {
|
|
2542
2723
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
@@ -2546,26 +2727,21 @@ var LlmJudgeEvaluator = class {
|
|
|
2546
2727
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2547
2728
|
}
|
|
2548
2729
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2549
|
-
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
2550
2730
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2551
|
-
|
|
2552
|
-
|
|
2553
|
-
|
|
2554
|
-
|
|
2555
|
-
|
|
2556
|
-
|
|
2557
|
-
|
|
2558
|
-
|
|
2559
|
-
|
|
2560
|
-
|
|
2561
|
-
|
|
2562
|
-
prompt = substituteVariables(systemPrompt, variables);
|
|
2563
|
-
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2564
|
-
}
|
|
2565
|
-
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
2731
|
+
const variables = {
|
|
2732
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2733
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2734
|
+
candidate_answer: context.candidate.trim(),
|
|
2735
|
+
reference_answer: (context.evalCase.reference_answer ?? "").trim(),
|
|
2736
|
+
expected_outcome: context.evalCase.expected_outcome.trim(),
|
|
2737
|
+
question: formattedQuestion.trim()
|
|
2738
|
+
};
|
|
2739
|
+
const systemPrompt = buildOutputSchema();
|
|
2740
|
+
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
2741
|
+
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
2566
2742
|
const response = await judgeProvider.invoke({
|
|
2567
|
-
question:
|
|
2568
|
-
|
|
2743
|
+
question: userPrompt,
|
|
2744
|
+
systemPrompt,
|
|
2569
2745
|
evalCaseId: context.evalCase.id,
|
|
2570
2746
|
attempt: context.attempt,
|
|
2571
2747
|
maxOutputTokens: this.maxOutputTokens,
|
|
@@ -2578,11 +2754,9 @@ var LlmJudgeEvaluator = class {
|
|
|
2578
2754
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2579
2755
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2580
2756
|
const evaluatorRawRequest = {
|
|
2581
|
-
|
|
2582
|
-
|
|
2583
|
-
|
|
2584
|
-
target: context.target.name,
|
|
2585
|
-
...systemPrompt !== void 0 && { systemPrompt }
|
|
2757
|
+
userPrompt,
|
|
2758
|
+
systemPrompt,
|
|
2759
|
+
target: judgeProvider.targetName
|
|
2586
2760
|
};
|
|
2587
2761
|
return {
|
|
2588
2762
|
score,
|
|
@@ -2594,20 +2768,8 @@ var LlmJudgeEvaluator = class {
|
|
|
2594
2768
|
};
|
|
2595
2769
|
}
|
|
2596
2770
|
};
|
|
2597
|
-
function
|
|
2598
|
-
|
|
2599
|
-
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2600
|
-
""
|
|
2601
|
-
];
|
|
2602
|
-
if (hasReferenceAnswer) {
|
|
2603
|
-
basePrompt.push(
|
|
2604
|
-
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
2605
|
-
""
|
|
2606
|
-
);
|
|
2607
|
-
}
|
|
2608
|
-
basePrompt.push(
|
|
2609
|
-
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2610
|
-
"",
|
|
2771
|
+
function buildOutputSchema() {
|
|
2772
|
+
return [
|
|
2611
2773
|
"You must respond with a single JSON object matching this schema:",
|
|
2612
2774
|
"",
|
|
2613
2775
|
"{",
|
|
@@ -2616,30 +2778,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
|
|
|
2616
2778
|
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
2617
2779
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2618
2780
|
"}"
|
|
2619
|
-
);
|
|
2620
|
-
return basePrompt.join("\n");
|
|
2621
|
-
}
|
|
2622
|
-
function buildQualityPrompt(evalCase, candidate, question) {
|
|
2623
|
-
const parts = [
|
|
2624
|
-
"[[ ## expected_outcome ## ]]",
|
|
2625
|
-
evalCase.expected_outcome.trim(),
|
|
2626
|
-
"",
|
|
2627
|
-
"[[ ## question ## ]]",
|
|
2628
|
-
question.trim(),
|
|
2629
|
-
""
|
|
2630
|
-
];
|
|
2631
|
-
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
2632
|
-
parts.push(
|
|
2633
|
-
"[[ ## reference_answer ## ]]",
|
|
2634
|
-
evalCase.reference_answer.trim(),
|
|
2635
|
-
""
|
|
2636
|
-
);
|
|
2637
|
-
}
|
|
2638
|
-
parts.push(
|
|
2639
|
-
"[[ ## candidate_answer ## ]]",
|
|
2640
|
-
candidate.trim()
|
|
2641
|
-
);
|
|
2642
|
-
return parts.join("\n");
|
|
2781
|
+
].join("\n");
|
|
2643
2782
|
}
|
|
2644
2783
|
function clampScore(value) {
|
|
2645
2784
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -2721,9 +2860,6 @@ function extractJsonBlob(text) {
|
|
|
2721
2860
|
function isNonEmptyString(value) {
|
|
2722
2861
|
return typeof value === "string" && value.trim().length > 0;
|
|
2723
2862
|
}
|
|
2724
|
-
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
2725
|
-
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
2726
|
-
}
|
|
2727
2863
|
var CodeEvaluator = class {
|
|
2728
2864
|
kind = "code";
|
|
2729
2865
|
script;
|
|
@@ -2829,19 +2965,16 @@ function parseJsonSafe(payload) {
|
|
|
2829
2965
|
return void 0;
|
|
2830
2966
|
}
|
|
2831
2967
|
}
|
|
2832
|
-
function hasTemplateVariables(text) {
|
|
2833
|
-
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
2834
|
-
}
|
|
2835
2968
|
function substituteVariables(template, variables) {
|
|
2836
|
-
return template.replace(
|
|
2969
|
+
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
2837
2970
|
return variables[varName] ?? match;
|
|
2838
2971
|
});
|
|
2839
2972
|
}
|
|
2840
2973
|
|
|
2841
2974
|
// src/evaluation/orchestrator.ts
|
|
2842
|
-
import { createHash, randomUUID as
|
|
2975
|
+
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
2843
2976
|
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
2844
|
-
import
|
|
2977
|
+
import path12 from "node:path";
|
|
2845
2978
|
|
|
2846
2979
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
2847
2980
|
var Node = class {
|
|
@@ -3404,6 +3537,7 @@ async function evaluateCandidate(options) {
|
|
|
3404
3537
|
}
|
|
3405
3538
|
}
|
|
3406
3539
|
return {
|
|
3540
|
+
timestamp: completedAt.toISOString(),
|
|
3407
3541
|
eval_id: evalCase.id,
|
|
3408
3542
|
dataset: evalCase.dataset,
|
|
3409
3543
|
conversation_id: evalCase.conversation_id,
|
|
@@ -3411,14 +3545,12 @@ async function evaluateCandidate(options) {
|
|
|
3411
3545
|
hits: score.hits,
|
|
3412
3546
|
misses: score.misses,
|
|
3413
3547
|
candidate_answer: candidate,
|
|
3414
|
-
expected_aspect_count: score.expectedAspectCount,
|
|
3415
3548
|
target: target.name,
|
|
3416
|
-
timestamp: completedAt.toISOString(),
|
|
3417
3549
|
reasoning: score.reasoning,
|
|
3418
3550
|
raw_aspects: score.rawAspects,
|
|
3419
3551
|
agent_provider_request: agentProviderRequest,
|
|
3420
3552
|
lm_provider_request: lmProviderRequest,
|
|
3421
|
-
|
|
3553
|
+
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3422
3554
|
evaluator_results: evaluatorResults
|
|
3423
3555
|
};
|
|
3424
3556
|
}
|
|
@@ -3495,7 +3627,7 @@ async function runEvaluatorList(options) {
|
|
|
3495
3627
|
hits: score2.hits,
|
|
3496
3628
|
misses: score2.misses,
|
|
3497
3629
|
reasoning: score2.reasoning,
|
|
3498
|
-
|
|
3630
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3499
3631
|
});
|
|
3500
3632
|
continue;
|
|
3501
3633
|
}
|
|
@@ -3522,7 +3654,7 @@ async function runEvaluatorList(options) {
|
|
|
3522
3654
|
hits: score2.hits,
|
|
3523
3655
|
misses: score2.misses,
|
|
3524
3656
|
reasoning: score2.reasoning,
|
|
3525
|
-
|
|
3657
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3526
3658
|
});
|
|
3527
3659
|
continue;
|
|
3528
3660
|
}
|
|
@@ -3575,7 +3707,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3575
3707
|
promptInputs,
|
|
3576
3708
|
now,
|
|
3577
3709
|
judgeProvider,
|
|
3578
|
-
|
|
3710
|
+
evaluatorTemplateOverride: customPrompt,
|
|
3579
3711
|
evaluator: config
|
|
3580
3712
|
});
|
|
3581
3713
|
}
|
|
@@ -3616,8 +3748,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
3616
3748
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
3617
3749
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3618
3750
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
3619
|
-
const filePath =
|
|
3620
|
-
await mkdir2(
|
|
3751
|
+
const filePath = path12.resolve(directory, filename);
|
|
3752
|
+
await mkdir2(path12.dirname(filePath), { recursive: true });
|
|
3621
3753
|
const payload = {
|
|
3622
3754
|
eval_id: evalCase.id,
|
|
3623
3755
|
question: promptInputs.question,
|
|
@@ -3631,7 +3763,7 @@ function sanitizeFilename(value) {
|
|
|
3631
3763
|
return "prompt";
|
|
3632
3764
|
}
|
|
3633
3765
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3634
|
-
return sanitized.length > 0 ? sanitized :
|
|
3766
|
+
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
3635
3767
|
}
|
|
3636
3768
|
async function invokeProvider(provider, options) {
|
|
3637
3769
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -3687,6 +3819,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3687
3819
|
}
|
|
3688
3820
|
}
|
|
3689
3821
|
return {
|
|
3822
|
+
timestamp: timestamp.toISOString(),
|
|
3690
3823
|
eval_id: evalCase.id,
|
|
3691
3824
|
dataset: evalCase.dataset,
|
|
3692
3825
|
conversation_id: evalCase.conversation_id,
|
|
@@ -3694,9 +3827,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3694
3827
|
hits: [],
|
|
3695
3828
|
misses: [`Error: ${message}`],
|
|
3696
3829
|
candidate_answer: `Error occurred: ${message}`,
|
|
3697
|
-
expected_aspect_count: 0,
|
|
3698
3830
|
target: targetName,
|
|
3699
|
-
timestamp: timestamp.toISOString(),
|
|
3700
3831
|
raw_aspects: [],
|
|
3701
3832
|
agent_provider_request: agentProviderRequest,
|
|
3702
3833
|
lm_provider_request: lmProviderRequest,
|