@agentv/core 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-YQBJAT5I.js → chunk-U3GEJ3K7.js} +1 -1
- package/dist/{chunk-YQBJAT5I.js.map → chunk-U3GEJ3K7.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +675 -562
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +29 -26
- package/dist/index.d.ts +29 -26
- package/dist/index.js +707 -592
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-U3GEJ3K7.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -62,48 +62,197 @@ function getHitCount(result) {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
// src/evaluation/yaml-parser.ts
|
|
65
|
+
import { readFile as readFile4 } from "node:fs/promises";
|
|
66
|
+
import path6 from "node:path";
|
|
67
|
+
import { parse as parse2 } from "yaml";
|
|
68
|
+
|
|
69
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
70
|
+
function extractCodeBlocks(segments) {
|
|
71
|
+
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
|
+
const codeBlocks = [];
|
|
73
|
+
for (const segment of segments) {
|
|
74
|
+
const typeValue = segment["type"];
|
|
75
|
+
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
76
|
+
continue;
|
|
77
|
+
}
|
|
78
|
+
const textValue = segment["value"];
|
|
79
|
+
if (typeof textValue !== "string") {
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
83
|
+
if (matches) {
|
|
84
|
+
codeBlocks.push(...matches);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
return codeBlocks;
|
|
88
|
+
}
|
|
89
|
+
function formatFileContents(parts) {
|
|
90
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
91
|
+
if (fileCount > 0) {
|
|
92
|
+
return parts.map((part) => {
|
|
93
|
+
if (part.isFile && part.displayPath) {
|
|
94
|
+
return `<file path="${part.displayPath}">
|
|
95
|
+
${part.content}
|
|
96
|
+
</file>`;
|
|
97
|
+
}
|
|
98
|
+
return part.content;
|
|
99
|
+
}).join("\n\n");
|
|
100
|
+
}
|
|
101
|
+
return parts.map((p) => p.content).join(" ");
|
|
102
|
+
}
|
|
103
|
+
function formatSegment(segment) {
|
|
104
|
+
const type = asString(segment.type);
|
|
105
|
+
if (type === "text") {
|
|
106
|
+
return asString(segment.value);
|
|
107
|
+
}
|
|
108
|
+
if (type === "guideline_ref") {
|
|
109
|
+
const refPath = asString(segment.path);
|
|
110
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
111
|
+
}
|
|
112
|
+
if (type === "file") {
|
|
113
|
+
const text = asString(segment.text);
|
|
114
|
+
const filePath = asString(segment.path);
|
|
115
|
+
if (text && filePath) {
|
|
116
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
return void 0;
|
|
120
|
+
}
|
|
121
|
+
function hasVisibleContent(segments) {
|
|
122
|
+
return segments.some((segment) => {
|
|
123
|
+
const type = asString(segment.type);
|
|
124
|
+
if (type === "text") {
|
|
125
|
+
const value = asString(segment.value);
|
|
126
|
+
return value !== void 0 && value.trim().length > 0;
|
|
127
|
+
}
|
|
128
|
+
if (type === "guideline_ref") {
|
|
129
|
+
return false;
|
|
130
|
+
}
|
|
131
|
+
if (type === "file") {
|
|
132
|
+
const text = asString(segment.text);
|
|
133
|
+
return text !== void 0 && text.trim().length > 0;
|
|
134
|
+
}
|
|
135
|
+
return false;
|
|
136
|
+
});
|
|
137
|
+
}
|
|
138
|
+
function asString(value) {
|
|
139
|
+
return typeof value === "string" ? value : void 0;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// src/evaluation/loaders/config-loader.ts
|
|
65
143
|
import micromatch from "micromatch";
|
|
144
|
+
import { readFile } from "node:fs/promises";
|
|
145
|
+
import path2 from "node:path";
|
|
146
|
+
import { parse } from "yaml";
|
|
147
|
+
|
|
148
|
+
// src/evaluation/loaders/file-resolver.ts
|
|
66
149
|
import { constants } from "node:fs";
|
|
67
|
-
import { access
|
|
150
|
+
import { access } from "node:fs/promises";
|
|
68
151
|
import path from "node:path";
|
|
69
|
-
|
|
70
|
-
import { parse } from "yaml";
|
|
71
|
-
var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
|
-
var ANSI_YELLOW = "\x1B[33m";
|
|
73
|
-
var ANSI_RESET = "\x1B[0m";
|
|
74
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
75
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
76
|
-
async function readTestSuiteMetadata(testFilePath) {
|
|
152
|
+
async function fileExists2(absolutePath) {
|
|
77
153
|
try {
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
const parsed = parse(content);
|
|
81
|
-
if (!isJsonObject(parsed)) {
|
|
82
|
-
return {};
|
|
83
|
-
}
|
|
84
|
-
return { target: extractTargetFromSuite(parsed) };
|
|
154
|
+
await access(absolutePath, constants.F_OK);
|
|
155
|
+
return true;
|
|
85
156
|
} catch {
|
|
86
|
-
return
|
|
157
|
+
return false;
|
|
87
158
|
}
|
|
88
159
|
}
|
|
89
|
-
function
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
160
|
+
function resolveToAbsolutePath(candidate) {
|
|
161
|
+
if (candidate instanceof URL) {
|
|
162
|
+
return new URL(candidate).pathname;
|
|
163
|
+
}
|
|
164
|
+
if (typeof candidate === "string") {
|
|
165
|
+
if (candidate.startsWith("file://")) {
|
|
166
|
+
return new URL(candidate).pathname;
|
|
95
167
|
}
|
|
168
|
+
return path.resolve(candidate);
|
|
96
169
|
}
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
170
|
+
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
171
|
+
}
|
|
172
|
+
function buildDirectoryChain2(filePath, repoRoot) {
|
|
173
|
+
const directories = [];
|
|
174
|
+
const seen = /* @__PURE__ */ new Set();
|
|
175
|
+
const boundary = path.resolve(repoRoot);
|
|
176
|
+
let current = path.resolve(path.dirname(filePath));
|
|
177
|
+
while (current !== void 0) {
|
|
178
|
+
if (!seen.has(current)) {
|
|
179
|
+
directories.push(current);
|
|
180
|
+
seen.add(current);
|
|
181
|
+
}
|
|
182
|
+
if (current === boundary) {
|
|
183
|
+
break;
|
|
184
|
+
}
|
|
185
|
+
const parent = path.dirname(current);
|
|
186
|
+
if (parent === current) {
|
|
187
|
+
break;
|
|
188
|
+
}
|
|
189
|
+
current = parent;
|
|
100
190
|
}
|
|
101
|
-
|
|
191
|
+
if (!seen.has(boundary)) {
|
|
192
|
+
directories.push(boundary);
|
|
193
|
+
}
|
|
194
|
+
return directories;
|
|
102
195
|
}
|
|
196
|
+
function buildSearchRoots2(evalPath, repoRoot) {
|
|
197
|
+
const uniqueRoots = [];
|
|
198
|
+
const addRoot = (root) => {
|
|
199
|
+
const normalized = path.resolve(root);
|
|
200
|
+
if (!uniqueRoots.includes(normalized)) {
|
|
201
|
+
uniqueRoots.push(normalized);
|
|
202
|
+
}
|
|
203
|
+
};
|
|
204
|
+
let currentDir = path.dirname(evalPath);
|
|
205
|
+
let reachedBoundary = false;
|
|
206
|
+
while (!reachedBoundary) {
|
|
207
|
+
addRoot(currentDir);
|
|
208
|
+
const parentDir = path.dirname(currentDir);
|
|
209
|
+
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
210
|
+
reachedBoundary = true;
|
|
211
|
+
} else {
|
|
212
|
+
currentDir = parentDir;
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
addRoot(repoRoot);
|
|
216
|
+
addRoot(process.cwd());
|
|
217
|
+
return uniqueRoots;
|
|
218
|
+
}
|
|
219
|
+
function trimLeadingSeparators(value) {
|
|
220
|
+
const trimmed = value.replace(/^[/\\]+/, "");
|
|
221
|
+
return trimmed.length > 0 ? trimmed : value;
|
|
222
|
+
}
|
|
223
|
+
async function resolveFileReference2(rawValue, searchRoots) {
|
|
224
|
+
const displayPath = trimLeadingSeparators(rawValue);
|
|
225
|
+
const potentialPaths = [];
|
|
226
|
+
if (path.isAbsolute(rawValue)) {
|
|
227
|
+
potentialPaths.push(path.normalize(rawValue));
|
|
228
|
+
}
|
|
229
|
+
for (const base of searchRoots) {
|
|
230
|
+
potentialPaths.push(path.resolve(base, displayPath));
|
|
231
|
+
}
|
|
232
|
+
const attempted = [];
|
|
233
|
+
const seen = /* @__PURE__ */ new Set();
|
|
234
|
+
for (const candidate of potentialPaths) {
|
|
235
|
+
const absoluteCandidate = path.resolve(candidate);
|
|
236
|
+
if (seen.has(absoluteCandidate)) {
|
|
237
|
+
continue;
|
|
238
|
+
}
|
|
239
|
+
seen.add(absoluteCandidate);
|
|
240
|
+
attempted.push(absoluteCandidate);
|
|
241
|
+
if (await fileExists2(absoluteCandidate)) {
|
|
242
|
+
return { displayPath, resolvedPath: absoluteCandidate, attempted };
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
return { displayPath, attempted };
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
// src/evaluation/loaders/config-loader.ts
|
|
249
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
250
|
+
var ANSI_YELLOW = "\x1B[33m";
|
|
251
|
+
var ANSI_RESET = "\x1B[0m";
|
|
103
252
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
104
|
-
const directories =
|
|
253
|
+
const directories = buildDirectoryChain2(evalFilePath, repoRoot);
|
|
105
254
|
for (const directory of directories) {
|
|
106
|
-
const configPath =
|
|
255
|
+
const configPath = path2.join(directory, ".agentv", "config.yaml");
|
|
107
256
|
if (!await fileExists2(configPath)) {
|
|
108
257
|
continue;
|
|
109
258
|
}
|
|
@@ -146,24 +295,134 @@ function isGuidelineFile(filePath, patterns) {
|
|
|
146
295
|
const patternsToUse = patterns ?? [];
|
|
147
296
|
return micromatch.isMatch(normalized, patternsToUse);
|
|
148
297
|
}
|
|
149
|
-
function
|
|
150
|
-
const
|
|
151
|
-
|
|
152
|
-
const
|
|
153
|
-
if (typeof
|
|
298
|
+
function extractTargetFromSuite(suite) {
|
|
299
|
+
const execution = suite.execution;
|
|
300
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
301
|
+
const executionTarget = execution.target;
|
|
302
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
303
|
+
return executionTarget.trim();
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
const targetValue = suite.target;
|
|
307
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
308
|
+
return targetValue.trim();
|
|
309
|
+
}
|
|
310
|
+
return void 0;
|
|
311
|
+
}
|
|
312
|
+
function logWarning(message) {
|
|
313
|
+
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
317
|
+
import path3 from "node:path";
|
|
318
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
319
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
320
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
321
|
+
const execution = rawEvalCase.execution;
|
|
322
|
+
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
323
|
+
if (candidateEvaluators === void 0) {
|
|
324
|
+
return void 0;
|
|
325
|
+
}
|
|
326
|
+
if (!Array.isArray(candidateEvaluators)) {
|
|
327
|
+
logWarning2(`Skipping evaluators for '${evalId}': expected array`);
|
|
328
|
+
return void 0;
|
|
329
|
+
}
|
|
330
|
+
const evaluators = [];
|
|
331
|
+
for (const rawEvaluator of candidateEvaluators) {
|
|
332
|
+
if (!isJsonObject2(rawEvaluator)) {
|
|
333
|
+
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
154
334
|
continue;
|
|
155
335
|
}
|
|
156
|
-
const
|
|
157
|
-
|
|
336
|
+
const name = asString2(rawEvaluator.name);
|
|
337
|
+
const typeValue = rawEvaluator.type;
|
|
338
|
+
if (!name || !isEvaluatorKind(typeValue)) {
|
|
339
|
+
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
158
340
|
continue;
|
|
159
341
|
}
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
342
|
+
if (typeValue === "code") {
|
|
343
|
+
const script = asString2(rawEvaluator.script);
|
|
344
|
+
if (!script) {
|
|
345
|
+
logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
346
|
+
continue;
|
|
347
|
+
}
|
|
348
|
+
const cwd = asString2(rawEvaluator.cwd);
|
|
349
|
+
let resolvedCwd;
|
|
350
|
+
if (cwd) {
|
|
351
|
+
const resolved = await resolveFileReference2(cwd, searchRoots);
|
|
352
|
+
if (resolved.resolvedPath) {
|
|
353
|
+
resolvedCwd = path3.resolve(resolved.resolvedPath);
|
|
354
|
+
} else {
|
|
355
|
+
logWarning2(
|
|
356
|
+
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
357
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
358
|
+
);
|
|
359
|
+
}
|
|
360
|
+
} else {
|
|
361
|
+
resolvedCwd = searchRoots[0];
|
|
362
|
+
}
|
|
363
|
+
evaluators.push({
|
|
364
|
+
name,
|
|
365
|
+
type: "code",
|
|
366
|
+
script,
|
|
367
|
+
cwd,
|
|
368
|
+
resolvedCwd
|
|
369
|
+
});
|
|
370
|
+
continue;
|
|
371
|
+
}
|
|
372
|
+
const prompt = asString2(rawEvaluator.prompt);
|
|
373
|
+
let promptPath;
|
|
374
|
+
if (prompt) {
|
|
375
|
+
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
376
|
+
if (resolved.resolvedPath) {
|
|
377
|
+
promptPath = path3.resolve(resolved.resolvedPath);
|
|
378
|
+
} else {
|
|
379
|
+
logWarning2(
|
|
380
|
+
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
381
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
382
|
+
);
|
|
383
|
+
}
|
|
163
384
|
}
|
|
385
|
+
const _model = asString2(rawEvaluator.model);
|
|
386
|
+
evaluators.push({
|
|
387
|
+
name,
|
|
388
|
+
type: "llm_judge",
|
|
389
|
+
prompt,
|
|
390
|
+
promptPath
|
|
391
|
+
});
|
|
164
392
|
}
|
|
165
|
-
return
|
|
393
|
+
return evaluators.length > 0 ? evaluators : void 0;
|
|
166
394
|
}
|
|
395
|
+
function coerceEvaluator(candidate, contextId) {
|
|
396
|
+
if (typeof candidate !== "string") {
|
|
397
|
+
return void 0;
|
|
398
|
+
}
|
|
399
|
+
if (isEvaluatorKind(candidate)) {
|
|
400
|
+
return candidate;
|
|
401
|
+
}
|
|
402
|
+
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
403
|
+
return void 0;
|
|
404
|
+
}
|
|
405
|
+
function asString2(value) {
|
|
406
|
+
return typeof value === "string" ? value : void 0;
|
|
407
|
+
}
|
|
408
|
+
function isJsonObject2(value) {
|
|
409
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
410
|
+
}
|
|
411
|
+
function logWarning2(message, details) {
|
|
412
|
+
if (details && details.length > 0) {
|
|
413
|
+
const detailBlock = details.join("\n");
|
|
414
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}
|
|
415
|
+
${detailBlock}${ANSI_RESET2}`);
|
|
416
|
+
} else {
|
|
417
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
// src/evaluation/loaders/message-processor.ts
|
|
422
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
423
|
+
import path4 from "node:path";
|
|
424
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
425
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
167
426
|
async function processMessages(options) {
|
|
168
427
|
const {
|
|
169
428
|
messages,
|
|
@@ -189,257 +448,173 @@ async function processMessages(options) {
|
|
|
189
448
|
if (!isJsonObject(rawSegment)) {
|
|
190
449
|
continue;
|
|
191
450
|
}
|
|
192
|
-
const segmentType =
|
|
451
|
+
const segmentType = asString3(rawSegment.type);
|
|
193
452
|
if (segmentType === "file") {
|
|
194
|
-
const rawValue =
|
|
453
|
+
const rawValue = asString3(rawSegment.value);
|
|
195
454
|
if (!rawValue) {
|
|
196
455
|
continue;
|
|
197
456
|
}
|
|
198
|
-
const { displayPath, resolvedPath, attempted } = await
|
|
457
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
199
458
|
rawValue,
|
|
200
|
-
searchRoots
|
|
201
|
-
);
|
|
202
|
-
if (!resolvedPath) {
|
|
203
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
204
|
-
const context = messageType === "input" ? "" : " in expected_messages";
|
|
205
|
-
|
|
206
|
-
continue;
|
|
207
|
-
}
|
|
208
|
-
try {
|
|
209
|
-
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
210
|
-
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
211
|
-
const relativeToRepo = path.relative(repoRootPath, resolvedPath);
|
|
212
|
-
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
213
|
-
guidelinePaths.push(path.resolve(resolvedPath));
|
|
214
|
-
if (verbose) {
|
|
215
|
-
console.log(` [Guideline] Found: ${displayPath}`);
|
|
216
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
217
|
-
}
|
|
218
|
-
continue;
|
|
219
|
-
}
|
|
220
|
-
}
|
|
221
|
-
segments.push({
|
|
222
|
-
type: "file",
|
|
223
|
-
path: displayPath,
|
|
224
|
-
text: fileContent,
|
|
225
|
-
resolvedPath: path.resolve(resolvedPath)
|
|
226
|
-
});
|
|
227
|
-
if (verbose) {
|
|
228
|
-
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
229
|
-
console.log(` ${label} Found: ${displayPath}`);
|
|
230
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
231
|
-
}
|
|
232
|
-
} catch (error) {
|
|
233
|
-
const context = messageType === "input" ? "" : " expected output";
|
|
234
|
-
logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
235
|
-
}
|
|
236
|
-
continue;
|
|
237
|
-
}
|
|
238
|
-
const clonedSegment = cloneJsonObject(rawSegment);
|
|
239
|
-
segments.push(clonedSegment);
|
|
240
|
-
const inlineValue = clonedSegment.value;
|
|
241
|
-
if (typeof inlineValue === "string" && textParts) {
|
|
242
|
-
textParts.push(inlineValue);
|
|
243
|
-
}
|
|
244
|
-
}
|
|
245
|
-
}
|
|
246
|
-
return segments;
|
|
247
|
-
}
|
|
248
|
-
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
249
|
-
const verbose = options?.verbose ?? false;
|
|
250
|
-
const evalIdFilter = options?.evalId;
|
|
251
|
-
const absoluteTestPath = path.resolve(evalFilePath);
|
|
252
|
-
if (!await fileExists2(absoluteTestPath)) {
|
|
253
|
-
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
254
|
-
}
|
|
255
|
-
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
256
|
-
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
257
|
-
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
258
|
-
const guidelinePatterns = config?.guideline_patterns;
|
|
259
|
-
const rawFile = await readFile(absoluteTestPath, "utf8");
|
|
260
|
-
const parsed = parse(rawFile);
|
|
261
|
-
if (!isJsonObject(parsed)) {
|
|
262
|
-
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
263
|
-
}
|
|
264
|
-
const suite = parsed;
|
|
265
|
-
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
266
|
-
const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
267
|
-
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
268
|
-
const schema = suite.$schema;
|
|
269
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
270
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
271
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
272
|
-
throw new Error(message);
|
|
273
|
-
}
|
|
274
|
-
const rawTestcases = suite.evalcases;
|
|
275
|
-
if (!Array.isArray(rawTestcases)) {
|
|
276
|
-
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
277
|
-
}
|
|
278
|
-
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
279
|
-
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
280
|
-
const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
|
|
281
|
-
const results = [];
|
|
282
|
-
for (const rawEvalcase of rawTestcases) {
|
|
283
|
-
if (!isJsonObject(rawEvalcase)) {
|
|
284
|
-
logWarning("Skipping invalid eval case entry (expected object)");
|
|
285
|
-
continue;
|
|
286
|
-
}
|
|
287
|
-
const evalcase = rawEvalcase;
|
|
288
|
-
const id = asString(evalcase.id);
|
|
289
|
-
if (evalIdFilter && id !== evalIdFilter) {
|
|
290
|
-
continue;
|
|
291
|
-
}
|
|
292
|
-
const conversationId = asString(evalcase.conversation_id);
|
|
293
|
-
const outcome = asString(evalcase.outcome);
|
|
294
|
-
const inputMessagesValue = evalcase.input_messages;
|
|
295
|
-
const expectedMessagesValue = evalcase.expected_messages;
|
|
296
|
-
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
297
|
-
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
298
|
-
continue;
|
|
299
|
-
}
|
|
300
|
-
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
301
|
-
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
302
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
303
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
304
|
-
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
305
|
-
continue;
|
|
306
|
-
}
|
|
307
|
-
if (expectedMessages.length > 1) {
|
|
308
|
-
logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
309
|
-
}
|
|
310
|
-
const guidelinePaths = [];
|
|
311
|
-
const inputTextParts = [];
|
|
312
|
-
const inputSegments = await processMessages({
|
|
313
|
-
messages: inputMessages,
|
|
314
|
-
searchRoots,
|
|
315
|
-
repoRootPath,
|
|
316
|
-
guidelinePatterns,
|
|
317
|
-
guidelinePaths,
|
|
318
|
-
textParts: inputTextParts,
|
|
319
|
-
messageType: "input",
|
|
320
|
-
verbose
|
|
321
|
-
});
|
|
322
|
-
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
323
|
-
messages: expectedMessages,
|
|
324
|
-
searchRoots,
|
|
325
|
-
repoRootPath,
|
|
326
|
-
guidelinePatterns,
|
|
327
|
-
messageType: "output",
|
|
328
|
-
verbose
|
|
329
|
-
}) : [];
|
|
330
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
331
|
-
const expectedContent = expectedMessages[0]?.content;
|
|
332
|
-
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
333
|
-
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
334
|
-
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
335
|
-
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
336
|
-
const userFilePaths = [];
|
|
337
|
-
for (const segment of inputSegments) {
|
|
338
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
339
|
-
userFilePaths.push(segment.resolvedPath);
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
const allFilePaths = [
|
|
343
|
-
...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
344
|
-
...userFilePaths
|
|
345
|
-
];
|
|
346
|
-
const testCase = {
|
|
347
|
-
id,
|
|
348
|
-
dataset: datasetName,
|
|
349
|
-
conversation_id: conversationId,
|
|
350
|
-
question,
|
|
351
|
-
input_messages: inputMessages,
|
|
352
|
-
input_segments: inputSegments,
|
|
353
|
-
output_segments: outputSegments,
|
|
354
|
-
reference_answer: referenceAnswer,
|
|
355
|
-
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
356
|
-
guideline_patterns: guidelinePatterns,
|
|
357
|
-
file_paths: allFilePaths,
|
|
358
|
-
code_snippets: codeSnippets,
|
|
359
|
-
expected_outcome: outcome,
|
|
360
|
-
evaluator: evalCaseEvaluatorKind,
|
|
361
|
-
evaluators
|
|
362
|
-
};
|
|
363
|
-
if (verbose) {
|
|
364
|
-
console.log(`
|
|
365
|
-
[Eval Case: ${id}]`);
|
|
366
|
-
if (testCase.guideline_paths.length > 0) {
|
|
367
|
-
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
368
|
-
for (const guidelinePath of testCase.guideline_paths) {
|
|
369
|
-
console.log(` - ${guidelinePath}`);
|
|
459
|
+
searchRoots
|
|
460
|
+
);
|
|
461
|
+
if (!resolvedPath) {
|
|
462
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
463
|
+
const context = messageType === "input" ? "" : " in expected_messages";
|
|
464
|
+
logWarning3(`File not found${context}: ${displayPath}`, attempts);
|
|
465
|
+
continue;
|
|
370
466
|
}
|
|
371
|
-
|
|
372
|
-
|
|
467
|
+
try {
|
|
468
|
+
const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
469
|
+
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
470
|
+
const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
|
|
471
|
+
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
472
|
+
guidelinePaths.push(path4.resolve(resolvedPath));
|
|
473
|
+
if (verbose) {
|
|
474
|
+
console.log(` [Guideline] Found: ${displayPath}`);
|
|
475
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
476
|
+
}
|
|
477
|
+
continue;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
segments.push({
|
|
481
|
+
type: "file",
|
|
482
|
+
path: displayPath,
|
|
483
|
+
text: fileContent,
|
|
484
|
+
resolvedPath: path4.resolve(resolvedPath)
|
|
485
|
+
});
|
|
486
|
+
if (verbose) {
|
|
487
|
+
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
488
|
+
console.log(` ${label} Found: ${displayPath}`);
|
|
489
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
490
|
+
}
|
|
491
|
+
} catch (error) {
|
|
492
|
+
const context = messageType === "input" ? "" : " expected output";
|
|
493
|
+
logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
494
|
+
}
|
|
495
|
+
continue;
|
|
496
|
+
}
|
|
497
|
+
const clonedSegment = cloneJsonObject(rawSegment);
|
|
498
|
+
segments.push(clonedSegment);
|
|
499
|
+
const inlineValue = clonedSegment.value;
|
|
500
|
+
if (typeof inlineValue === "string" && textParts) {
|
|
501
|
+
textParts.push(inlineValue);
|
|
373
502
|
}
|
|
374
503
|
}
|
|
375
|
-
results.push(testCase);
|
|
376
504
|
}
|
|
377
|
-
return
|
|
505
|
+
return segments;
|
|
378
506
|
}
|
|
379
|
-
function
|
|
380
|
-
if (
|
|
381
|
-
return
|
|
507
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
508
|
+
if (typeof content === "string") {
|
|
509
|
+
return content;
|
|
382
510
|
}
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
if (hasVisibleContent(segments)) {
|
|
386
|
-
messagesWithContent++;
|
|
387
|
-
}
|
|
511
|
+
if (!content) {
|
|
512
|
+
return "";
|
|
388
513
|
}
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
if (type === "text") {
|
|
395
|
-
const value = asString(segment.value);
|
|
396
|
-
return value !== void 0 && value.trim().length > 0;
|
|
514
|
+
const parts = [];
|
|
515
|
+
for (const entry of content) {
|
|
516
|
+
if (typeof entry === "string") {
|
|
517
|
+
parts.push({ content: entry, isFile: false });
|
|
518
|
+
continue;
|
|
397
519
|
}
|
|
398
|
-
if (
|
|
399
|
-
|
|
520
|
+
if (!isJsonObject(entry)) {
|
|
521
|
+
continue;
|
|
400
522
|
}
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
523
|
+
const segmentType = asString3(entry.type);
|
|
524
|
+
if (segmentType === "file") {
|
|
525
|
+
const rawValue = asString3(entry.value);
|
|
526
|
+
if (!rawValue) {
|
|
527
|
+
continue;
|
|
528
|
+
}
|
|
529
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
530
|
+
rawValue,
|
|
531
|
+
searchRoots
|
|
532
|
+
);
|
|
533
|
+
if (!resolvedPath) {
|
|
534
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
535
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
536
|
+
continue;
|
|
537
|
+
}
|
|
538
|
+
try {
|
|
539
|
+
const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
540
|
+
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
541
|
+
if (verbose) {
|
|
542
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
543
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
544
|
+
}
|
|
545
|
+
} catch (error) {
|
|
546
|
+
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
547
|
+
}
|
|
548
|
+
continue;
|
|
404
549
|
}
|
|
405
|
-
|
|
406
|
-
|
|
550
|
+
const textValue = asString3(entry.text);
|
|
551
|
+
if (typeof textValue === "string") {
|
|
552
|
+
parts.push({ content: textValue, isFile: false });
|
|
553
|
+
continue;
|
|
554
|
+
}
|
|
555
|
+
const valueValue = asString3(entry.value);
|
|
556
|
+
if (typeof valueValue === "string") {
|
|
557
|
+
parts.push({ content: valueValue, isFile: false });
|
|
558
|
+
continue;
|
|
559
|
+
}
|
|
560
|
+
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
561
|
+
}
|
|
562
|
+
return formatFileContents(parts);
|
|
407
563
|
}
|
|
408
|
-
function
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
564
|
+
function asString3(value) {
|
|
565
|
+
return typeof value === "string" ? value : void 0;
|
|
566
|
+
}
|
|
567
|
+
function cloneJsonObject(source) {
|
|
568
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
569
|
+
return Object.fromEntries(entries);
|
|
570
|
+
}
|
|
571
|
+
function cloneJsonValue(value) {
|
|
572
|
+
if (value === null) {
|
|
573
|
+
return null;
|
|
412
574
|
}
|
|
413
|
-
if (
|
|
414
|
-
|
|
415
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
575
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
576
|
+
return value;
|
|
416
577
|
}
|
|
417
|
-
if (
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
578
|
+
if (Array.isArray(value)) {
|
|
579
|
+
return value.map((item) => cloneJsonValue(item));
|
|
580
|
+
}
|
|
581
|
+
if (typeof value === "object") {
|
|
582
|
+
return cloneJsonObject(value);
|
|
583
|
+
}
|
|
584
|
+
return value;
|
|
585
|
+
}
|
|
586
|
+
function logWarning3(message, details) {
|
|
587
|
+
if (details && details.length > 0) {
|
|
588
|
+
const detailBlock = details.join("\n");
|
|
589
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
590
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
591
|
+
} else {
|
|
592
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
423
593
|
}
|
|
424
|
-
return void 0;
|
|
425
594
|
}
|
|
595
|
+
|
|
596
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
597
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
598
|
+
import path5 from "node:path";
|
|
599
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
600
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
426
601
|
async function buildPromptInputs(testCase) {
|
|
427
602
|
const guidelineParts = [];
|
|
428
603
|
for (const rawPath of testCase.guideline_paths) {
|
|
429
|
-
const absolutePath =
|
|
604
|
+
const absolutePath = path5.resolve(rawPath);
|
|
430
605
|
if (!await fileExists2(absolutePath)) {
|
|
431
|
-
|
|
606
|
+
logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
432
607
|
continue;
|
|
433
608
|
}
|
|
434
609
|
try {
|
|
435
|
-
const content = (await
|
|
610
|
+
const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
436
611
|
guidelineParts.push({
|
|
437
612
|
content,
|
|
438
613
|
isFile: true,
|
|
439
|
-
displayPath:
|
|
614
|
+
displayPath: path5.basename(absolutePath)
|
|
440
615
|
});
|
|
441
616
|
} catch (error) {
|
|
442
|
-
|
|
617
|
+
logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
443
618
|
}
|
|
444
619
|
}
|
|
445
620
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -463,9 +638,9 @@ async function buildPromptInputs(testCase) {
|
|
|
463
638
|
messageSegments.push({ type: "text", value: segment });
|
|
464
639
|
}
|
|
465
640
|
} else if (isJsonObject(segment)) {
|
|
466
|
-
const type =
|
|
641
|
+
const type = asString4(segment.type);
|
|
467
642
|
if (type === "file") {
|
|
468
|
-
const value =
|
|
643
|
+
const value = asString4(segment.value);
|
|
469
644
|
if (!value) continue;
|
|
470
645
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
471
646
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -476,7 +651,7 @@ async function buildPromptInputs(testCase) {
|
|
|
476
651
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
477
652
|
}
|
|
478
653
|
} else if (type === "text") {
|
|
479
|
-
const textValue =
|
|
654
|
+
const textValue = asString4(segment.value);
|
|
480
655
|
if (textValue && textValue.trim().length > 0) {
|
|
481
656
|
messageSegments.push({ type: "text", value: textValue });
|
|
482
657
|
}
|
|
@@ -532,6 +707,18 @@ ${messageContent}`);
|
|
|
532
707
|
}) : void 0;
|
|
533
708
|
return { question, guidelines, chatPrompt };
|
|
534
709
|
}
|
|
710
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
711
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
712
|
+
return true;
|
|
713
|
+
}
|
|
714
|
+
let messagesWithContent = 0;
|
|
715
|
+
for (const segments of processedSegmentsByMessage) {
|
|
716
|
+
if (hasVisibleContent(segments)) {
|
|
717
|
+
messagesWithContent++;
|
|
718
|
+
}
|
|
719
|
+
}
|
|
720
|
+
return messagesWithContent > 1;
|
|
721
|
+
}
|
|
535
722
|
function buildChatPromptFromSegments(options) {
|
|
536
723
|
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
537
724
|
if (messages.length === 0) {
|
|
@@ -570,243 +757,203 @@ ${guidelineContent.trim()}`);
|
|
|
570
757
|
}
|
|
571
758
|
for (let i = startIndex; i < messages.length; i++) {
|
|
572
759
|
const message = messages[i];
|
|
573
|
-
const segments = segmentsByMessage[i];
|
|
574
|
-
const contentParts = [];
|
|
575
|
-
let role = message.role;
|
|
576
|
-
let name;
|
|
577
|
-
if (role === "system") {
|
|
578
|
-
role = "assistant";
|
|
579
|
-
contentParts.push("@[System]:");
|
|
580
|
-
} else if (role === "tool") {
|
|
581
|
-
role = "function";
|
|
582
|
-
name = "tool";
|
|
583
|
-
}
|
|
584
|
-
for (const segment of segments) {
|
|
585
|
-
if (segment.type === "guideline_ref") {
|
|
586
|
-
continue;
|
|
587
|
-
}
|
|
588
|
-
const formatted = formatSegment(segment);
|
|
589
|
-
if (formatted) {
|
|
590
|
-
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
591
|
-
if (isGuidelineRef) {
|
|
592
|
-
continue;
|
|
593
|
-
}
|
|
594
|
-
contentParts.push(formatted);
|
|
595
|
-
}
|
|
596
|
-
}
|
|
597
|
-
if (contentParts.length === 0) {
|
|
598
|
-
continue;
|
|
599
|
-
}
|
|
600
|
-
chatPrompt.push({
|
|
601
|
-
role,
|
|
602
|
-
content: contentParts.join("\n"),
|
|
603
|
-
...name ? { name } : {}
|
|
604
|
-
});
|
|
605
|
-
}
|
|
606
|
-
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
607
|
-
}
|
|
608
|
-
async function fileExists2(absolutePath) {
|
|
609
|
-
try {
|
|
610
|
-
await access(absolutePath, constants.F_OK);
|
|
611
|
-
return true;
|
|
612
|
-
} catch {
|
|
613
|
-
return false;
|
|
614
|
-
}
|
|
615
|
-
}
|
|
616
|
-
function resolveToAbsolutePath(candidate) {
|
|
617
|
-
if (candidate instanceof URL) {
|
|
618
|
-
return fileURLToPath(candidate);
|
|
619
|
-
}
|
|
620
|
-
if (typeof candidate === "string") {
|
|
621
|
-
if (candidate.startsWith("file://")) {
|
|
622
|
-
return fileURLToPath(new URL(candidate));
|
|
623
|
-
}
|
|
624
|
-
return path.resolve(candidate);
|
|
625
|
-
}
|
|
626
|
-
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
627
|
-
}
|
|
628
|
-
function asString(value) {
|
|
629
|
-
return typeof value === "string" ? value : void 0;
|
|
630
|
-
}
|
|
631
|
-
function cloneJsonObject(source) {
|
|
632
|
-
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
633
|
-
return Object.fromEntries(entries);
|
|
634
|
-
}
|
|
635
|
-
function cloneJsonValue(value) {
|
|
636
|
-
if (value === null) {
|
|
637
|
-
return null;
|
|
638
|
-
}
|
|
639
|
-
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
640
|
-
return value;
|
|
641
|
-
}
|
|
642
|
-
if (Array.isArray(value)) {
|
|
643
|
-
return value.map((item) => cloneJsonValue(item));
|
|
644
|
-
}
|
|
645
|
-
return cloneJsonObject(value);
|
|
646
|
-
}
|
|
647
|
-
function formatFileContents(parts) {
|
|
648
|
-
const fileCount = parts.filter((p) => p.isFile).length;
|
|
649
|
-
if (fileCount > 0) {
|
|
650
|
-
return parts.map((part) => {
|
|
651
|
-
if (part.isFile && part.displayPath) {
|
|
652
|
-
return `<file path="${part.displayPath}">
|
|
653
|
-
${part.content}
|
|
654
|
-
</file>`;
|
|
655
|
-
}
|
|
656
|
-
return part.content;
|
|
657
|
-
}).join("\n\n");
|
|
658
|
-
}
|
|
659
|
-
return parts.map((p) => p.content).join(" ");
|
|
660
|
-
}
|
|
661
|
-
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
662
|
-
if (typeof content === "string") {
|
|
663
|
-
return content;
|
|
664
|
-
}
|
|
665
|
-
if (!content) {
|
|
666
|
-
return "";
|
|
667
|
-
}
|
|
668
|
-
const parts = [];
|
|
669
|
-
for (const entry of content) {
|
|
670
|
-
if (typeof entry === "string") {
|
|
671
|
-
parts.push({ content: entry, isFile: false });
|
|
672
|
-
continue;
|
|
673
|
-
}
|
|
674
|
-
if (!isJsonObject(entry)) {
|
|
675
|
-
continue;
|
|
760
|
+
const segments = segmentsByMessage[i];
|
|
761
|
+
const contentParts = [];
|
|
762
|
+
let role = message.role;
|
|
763
|
+
let name;
|
|
764
|
+
if (role === "system") {
|
|
765
|
+
role = "assistant";
|
|
766
|
+
contentParts.push("@[System]:");
|
|
767
|
+
} else if (role === "tool") {
|
|
768
|
+
role = "function";
|
|
769
|
+
name = "tool";
|
|
676
770
|
}
|
|
677
|
-
const
|
|
678
|
-
|
|
679
|
-
const rawValue = asString(entry.value);
|
|
680
|
-
if (!rawValue) {
|
|
681
|
-
continue;
|
|
682
|
-
}
|
|
683
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
684
|
-
rawValue,
|
|
685
|
-
searchRoots
|
|
686
|
-
);
|
|
687
|
-
if (!resolvedPath) {
|
|
688
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
689
|
-
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
771
|
+
for (const segment of segments) {
|
|
772
|
+
if (segment.type === "guideline_ref") {
|
|
690
773
|
continue;
|
|
691
774
|
}
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
if (
|
|
696
|
-
|
|
697
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
775
|
+
const formatted = formatSegment(segment);
|
|
776
|
+
if (formatted) {
|
|
777
|
+
const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
|
|
778
|
+
if (isGuidelineRef) {
|
|
779
|
+
continue;
|
|
698
780
|
}
|
|
699
|
-
|
|
700
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
781
|
+
contentParts.push(formatted);
|
|
701
782
|
}
|
|
702
|
-
continue;
|
|
703
783
|
}
|
|
704
|
-
|
|
705
|
-
if (typeof textValue === "string") {
|
|
706
|
-
parts.push({ content: textValue, isFile: false });
|
|
784
|
+
if (contentParts.length === 0) {
|
|
707
785
|
continue;
|
|
708
786
|
}
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
787
|
+
chatPrompt.push({
|
|
788
|
+
role,
|
|
789
|
+
content: contentParts.join("\n"),
|
|
790
|
+
...name ? { name } : {}
|
|
791
|
+
});
|
|
792
|
+
}
|
|
793
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
794
|
+
}
|
|
795
|
+
function asString4(value) {
|
|
796
|
+
return typeof value === "string" ? value : void 0;
|
|
797
|
+
}
|
|
798
|
+
function logWarning4(message) {
|
|
799
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
// src/evaluation/yaml-parser.ts
|
|
803
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
804
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
805
|
+
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
806
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
807
|
+
try {
|
|
808
|
+
const absolutePath = path6.resolve(testFilePath);
|
|
809
|
+
const content = await readFile4(absolutePath, "utf8");
|
|
810
|
+
const parsed = parse2(content);
|
|
811
|
+
if (!isJsonObject(parsed)) {
|
|
812
|
+
return {};
|
|
713
813
|
}
|
|
714
|
-
|
|
814
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
815
|
+
} catch {
|
|
816
|
+
return {};
|
|
715
817
|
}
|
|
716
|
-
return formatFileContents(parts);
|
|
717
818
|
}
|
|
718
|
-
async function
|
|
719
|
-
const
|
|
720
|
-
const
|
|
721
|
-
|
|
722
|
-
|
|
819
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
820
|
+
const verbose = options?.verbose ?? false;
|
|
821
|
+
const evalIdFilter = options?.evalId;
|
|
822
|
+
const absoluteTestPath = path6.resolve(evalFilePath);
|
|
823
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
824
|
+
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
825
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
826
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
827
|
+
const rawFile = await readFile4(absoluteTestPath, "utf8");
|
|
828
|
+
const parsed = parse2(rawFile);
|
|
829
|
+
if (!isJsonObject(parsed)) {
|
|
830
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
723
831
|
}
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
832
|
+
const suite = parsed;
|
|
833
|
+
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
834
|
+
const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
835
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
836
|
+
const schema = suite.$schema;
|
|
837
|
+
if (schema !== SCHEMA_EVAL_V2) {
|
|
838
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
839
|
+
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
840
|
+
throw new Error(message);
|
|
727
841
|
}
|
|
728
|
-
const
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
842
|
+
const rawTestcases = suite.evalcases;
|
|
843
|
+
if (!Array.isArray(rawTestcases)) {
|
|
844
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
845
|
+
}
|
|
846
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
847
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
848
|
+
const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
|
|
849
|
+
const results = [];
|
|
850
|
+
for (const rawEvalcase of rawTestcases) {
|
|
851
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
852
|
+
logWarning5("Skipping invalid eval case entry (expected object)");
|
|
732
853
|
continue;
|
|
733
854
|
}
|
|
734
|
-
const
|
|
735
|
-
const
|
|
736
|
-
if (
|
|
737
|
-
logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
855
|
+
const evalcase = rawEvalcase;
|
|
856
|
+
const id = asString5(evalcase.id);
|
|
857
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
738
858
|
continue;
|
|
739
859
|
}
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
}
|
|
746
|
-
const cwd = asString(rawEvaluator.cwd);
|
|
747
|
-
let resolvedCwd;
|
|
748
|
-
if (cwd) {
|
|
749
|
-
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
750
|
-
if (resolved.resolvedPath) {
|
|
751
|
-
resolvedCwd = path.resolve(resolved.resolvedPath);
|
|
752
|
-
} else {
|
|
753
|
-
logWarning(
|
|
754
|
-
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
755
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
756
|
-
);
|
|
757
|
-
}
|
|
758
|
-
} else {
|
|
759
|
-
resolvedCwd = searchRoots[0];
|
|
760
|
-
}
|
|
761
|
-
evaluators.push({
|
|
762
|
-
name,
|
|
763
|
-
type: "code",
|
|
764
|
-
script,
|
|
765
|
-
cwd,
|
|
766
|
-
resolvedCwd
|
|
767
|
-
});
|
|
860
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
861
|
+
const outcome = asString5(evalcase.outcome);
|
|
862
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
863
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
864
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
865
|
+
logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
768
866
|
continue;
|
|
769
867
|
}
|
|
770
|
-
const
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
868
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
869
|
+
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
870
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
871
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
872
|
+
logWarning5(`No valid expected message found for eval case: ${id}`);
|
|
873
|
+
continue;
|
|
874
|
+
}
|
|
875
|
+
if (expectedMessages.length > 1) {
|
|
876
|
+
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
877
|
+
}
|
|
878
|
+
const guidelinePaths = [];
|
|
879
|
+
const inputTextParts = [];
|
|
880
|
+
const inputSegments = await processMessages({
|
|
881
|
+
messages: inputMessages,
|
|
882
|
+
searchRoots,
|
|
883
|
+
repoRootPath,
|
|
884
|
+
guidelinePatterns,
|
|
885
|
+
guidelinePaths,
|
|
886
|
+
textParts: inputTextParts,
|
|
887
|
+
messageType: "input",
|
|
888
|
+
verbose
|
|
889
|
+
});
|
|
890
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
891
|
+
messages: expectedMessages,
|
|
892
|
+
searchRoots,
|
|
893
|
+
repoRootPath,
|
|
894
|
+
guidelinePatterns,
|
|
895
|
+
messageType: "output",
|
|
896
|
+
verbose
|
|
897
|
+
}) : [];
|
|
898
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
899
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
900
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
901
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
902
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
903
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
904
|
+
const userFilePaths = [];
|
|
905
|
+
for (const segment of inputSegments) {
|
|
906
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
907
|
+
userFilePaths.push(segment.resolvedPath);
|
|
908
|
+
}
|
|
909
|
+
}
|
|
910
|
+
const allFilePaths = [
|
|
911
|
+
...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
912
|
+
...userFilePaths
|
|
913
|
+
];
|
|
914
|
+
const testCase = {
|
|
915
|
+
id,
|
|
916
|
+
dataset: datasetName,
|
|
917
|
+
conversation_id: conversationId,
|
|
918
|
+
question,
|
|
919
|
+
input_messages: inputMessages,
|
|
920
|
+
input_segments: inputSegments,
|
|
921
|
+
output_segments: outputSegments,
|
|
922
|
+
reference_answer: referenceAnswer,
|
|
923
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
924
|
+
guideline_patterns: guidelinePatterns,
|
|
925
|
+
file_paths: allFilePaths,
|
|
926
|
+
code_snippets: codeSnippets,
|
|
927
|
+
expected_outcome: outcome,
|
|
928
|
+
evaluator: evalCaseEvaluatorKind,
|
|
929
|
+
evaluators
|
|
930
|
+
};
|
|
931
|
+
if (verbose) {
|
|
932
|
+
console.log(`
|
|
933
|
+
[Eval Case: ${id}]`);
|
|
934
|
+
if (testCase.guideline_paths.length > 0) {
|
|
935
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
936
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
937
|
+
console.log(` - ${guidelinePath}`);
|
|
938
|
+
}
|
|
776
939
|
} else {
|
|
777
|
-
|
|
778
|
-
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
779
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
780
|
-
);
|
|
940
|
+
console.log(" No guidelines found");
|
|
781
941
|
}
|
|
782
942
|
}
|
|
783
|
-
|
|
784
|
-
evaluators.push({
|
|
785
|
-
name,
|
|
786
|
-
type: "llm_judge",
|
|
787
|
-
prompt,
|
|
788
|
-
promptPath
|
|
789
|
-
});
|
|
943
|
+
results.push(testCase);
|
|
790
944
|
}
|
|
791
|
-
return
|
|
945
|
+
return results;
|
|
792
946
|
}
|
|
793
|
-
function
|
|
794
|
-
|
|
795
|
-
return void 0;
|
|
796
|
-
}
|
|
797
|
-
if (isEvaluatorKind(candidate)) {
|
|
798
|
-
return candidate;
|
|
799
|
-
}
|
|
800
|
-
logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
801
|
-
return void 0;
|
|
947
|
+
function asString5(value) {
|
|
948
|
+
return typeof value === "string" ? value : void 0;
|
|
802
949
|
}
|
|
803
|
-
function
|
|
950
|
+
function logWarning5(message, details) {
|
|
804
951
|
if (details && details.length > 0) {
|
|
805
952
|
const detailBlock = details.join("\n");
|
|
806
|
-
console.warn(`${
|
|
807
|
-
${detailBlock}${
|
|
953
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
954
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
808
955
|
} else {
|
|
809
|
-
console.warn(`${
|
|
956
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
810
957
|
}
|
|
811
958
|
}
|
|
812
959
|
|
|
@@ -838,9 +985,8 @@ function buildChatPrompt(request) {
|
|
|
838
985
|
}
|
|
839
986
|
function resolveSystemContent(request) {
|
|
840
987
|
const systemSegments = [];
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
systemSegments.push(metadataSystemPrompt.trim());
|
|
988
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
989
|
+
systemSegments.push(request.systemPrompt.trim());
|
|
844
990
|
} else {
|
|
845
991
|
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
846
992
|
}
|
|
@@ -1093,7 +1239,7 @@ var GeminiProvider = class {
|
|
|
1093
1239
|
import { exec as execWithCallback } from "node:child_process";
|
|
1094
1240
|
import fs from "node:fs/promises";
|
|
1095
1241
|
import os from "node:os";
|
|
1096
|
-
import
|
|
1242
|
+
import path7 from "node:path";
|
|
1097
1243
|
import { promisify } from "node:util";
|
|
1098
1244
|
var execAsync = promisify(execWithCallback);
|
|
1099
1245
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
@@ -1272,7 +1418,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
1272
1418
|
}
|
|
1273
1419
|
const unique = /* @__PURE__ */ new Map();
|
|
1274
1420
|
for (const inputFile of inputFiles) {
|
|
1275
|
-
const absolutePath =
|
|
1421
|
+
const absolutePath = path7.resolve(inputFile);
|
|
1276
1422
|
if (!unique.has(absolutePath)) {
|
|
1277
1423
|
unique.set(absolutePath, absolutePath);
|
|
1278
1424
|
}
|
|
@@ -1286,7 +1432,7 @@ function formatFileList(files, template) {
|
|
|
1286
1432
|
const formatter = template ?? "{path}";
|
|
1287
1433
|
return files.map((filePath) => {
|
|
1288
1434
|
const escapedPath = shellEscape(filePath);
|
|
1289
|
-
const escapedName = shellEscape(
|
|
1435
|
+
const escapedName = shellEscape(path7.basename(filePath));
|
|
1290
1436
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
1291
1437
|
}).join(" ");
|
|
1292
1438
|
}
|
|
@@ -1310,7 +1456,7 @@ function generateOutputFilePath(evalCaseId) {
|
|
|
1310
1456
|
const safeEvalId = evalCaseId || "unknown";
|
|
1311
1457
|
const timestamp = Date.now();
|
|
1312
1458
|
const random = Math.random().toString(36).substring(2, 9);
|
|
1313
|
-
return
|
|
1459
|
+
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
1314
1460
|
}
|
|
1315
1461
|
function formatTimeoutSuffix(timeoutMs) {
|
|
1316
1462
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -1326,7 +1472,7 @@ import { randomUUID } from "node:crypto";
|
|
|
1326
1472
|
import { constants as constants2, createWriteStream } from "node:fs";
|
|
1327
1473
|
import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
1328
1474
|
import { tmpdir } from "node:os";
|
|
1329
|
-
import
|
|
1475
|
+
import path9 from "node:path";
|
|
1330
1476
|
import { promisify as promisify2 } from "node:util";
|
|
1331
1477
|
|
|
1332
1478
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -1383,7 +1529,7 @@ function subscribeToCodexLogEntries(listener) {
|
|
|
1383
1529
|
}
|
|
1384
1530
|
|
|
1385
1531
|
// src/evaluation/providers/preread.ts
|
|
1386
|
-
import
|
|
1532
|
+
import path8 from "node:path";
|
|
1387
1533
|
function buildPromptDocument(request, inputFiles, options) {
|
|
1388
1534
|
const parts = [];
|
|
1389
1535
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -1408,7 +1554,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
1408
1554
|
}
|
|
1409
1555
|
const deduped = /* @__PURE__ */ new Map();
|
|
1410
1556
|
for (const inputFile of inputFiles) {
|
|
1411
|
-
const absolutePath =
|
|
1557
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1412
1558
|
if (!deduped.has(absolutePath)) {
|
|
1413
1559
|
deduped.set(absolutePath, absolutePath);
|
|
1414
1560
|
}
|
|
@@ -1421,14 +1567,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
1421
1567
|
}
|
|
1422
1568
|
const unique = /* @__PURE__ */ new Map();
|
|
1423
1569
|
for (const inputFile of inputFiles) {
|
|
1424
|
-
const absolutePath =
|
|
1570
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1425
1571
|
if (overrides?.has(absolutePath)) {
|
|
1426
1572
|
if (!unique.has(absolutePath)) {
|
|
1427
1573
|
unique.set(absolutePath, absolutePath);
|
|
1428
1574
|
}
|
|
1429
1575
|
continue;
|
|
1430
1576
|
}
|
|
1431
|
-
const normalized = absolutePath.split(
|
|
1577
|
+
const normalized = absolutePath.split(path8.sep).join("/");
|
|
1432
1578
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1433
1579
|
if (!unique.has(absolutePath)) {
|
|
1434
1580
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1443,7 +1589,7 @@ function collectInputFiles(inputFiles) {
|
|
|
1443
1589
|
}
|
|
1444
1590
|
const unique = /* @__PURE__ */ new Map();
|
|
1445
1591
|
for (const inputFile of inputFiles) {
|
|
1446
|
-
const absolutePath =
|
|
1592
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1447
1593
|
if (!unique.has(absolutePath)) {
|
|
1448
1594
|
unique.set(absolutePath, absolutePath);
|
|
1449
1595
|
}
|
|
@@ -1455,7 +1601,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
1455
1601
|
return "";
|
|
1456
1602
|
}
|
|
1457
1603
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1458
|
-
const fileName =
|
|
1604
|
+
const fileName = path8.basename(absolutePath);
|
|
1459
1605
|
const fileUri = pathToFileUri(absolutePath);
|
|
1460
1606
|
return `* [${fileName}](${fileUri})`;
|
|
1461
1607
|
});
|
|
@@ -1475,7 +1621,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
1475
1621
|
return sections.join("\n");
|
|
1476
1622
|
}
|
|
1477
1623
|
function pathToFileUri(filePath) {
|
|
1478
|
-
const absolutePath =
|
|
1624
|
+
const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
|
|
1479
1625
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1480
1626
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1481
1627
|
return `file:///${normalizedPath}`;
|
|
@@ -1513,7 +1659,7 @@ var CodexProvider = class {
|
|
|
1513
1659
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1514
1660
|
try {
|
|
1515
1661
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1516
|
-
const promptFile =
|
|
1662
|
+
const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
|
|
1517
1663
|
await writeFile(promptFile, promptContent, "utf8");
|
|
1518
1664
|
const args = this.buildCodexArgs();
|
|
1519
1665
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -1563,7 +1709,7 @@ var CodexProvider = class {
|
|
|
1563
1709
|
if (!this.config.cwd) {
|
|
1564
1710
|
return workspaceRoot;
|
|
1565
1711
|
}
|
|
1566
|
-
return
|
|
1712
|
+
return path9.resolve(this.config.cwd);
|
|
1567
1713
|
}
|
|
1568
1714
|
buildCodexArgs() {
|
|
1569
1715
|
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
@@ -1597,7 +1743,7 @@ var CodexProvider = class {
|
|
|
1597
1743
|
}
|
|
1598
1744
|
}
|
|
1599
1745
|
async createWorkspace() {
|
|
1600
|
-
return await mkdtemp(
|
|
1746
|
+
return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
|
|
1601
1747
|
}
|
|
1602
1748
|
async cleanupWorkspace(workspaceRoot) {
|
|
1603
1749
|
try {
|
|
@@ -1611,9 +1757,9 @@ var CodexProvider = class {
|
|
|
1611
1757
|
return void 0;
|
|
1612
1758
|
}
|
|
1613
1759
|
if (this.config.logDir) {
|
|
1614
|
-
return
|
|
1760
|
+
return path9.resolve(this.config.logDir);
|
|
1615
1761
|
}
|
|
1616
|
-
return
|
|
1762
|
+
return path9.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1617
1763
|
}
|
|
1618
1764
|
async createStreamLogger(request) {
|
|
1619
1765
|
const logDir = this.resolveLogDirectory();
|
|
@@ -1627,7 +1773,7 @@ var CodexProvider = class {
|
|
|
1627
1773
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1628
1774
|
return void 0;
|
|
1629
1775
|
}
|
|
1630
|
-
const filePath =
|
|
1776
|
+
const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
|
|
1631
1777
|
try {
|
|
1632
1778
|
const logger = await CodexStreamLogger.create({
|
|
1633
1779
|
filePath,
|
|
@@ -1842,7 +1988,7 @@ function tryParseJsonValue(rawLine) {
|
|
|
1842
1988
|
async function locateExecutable(candidate) {
|
|
1843
1989
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1844
1990
|
if (includesPathSeparator) {
|
|
1845
|
-
const resolved =
|
|
1991
|
+
const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
|
|
1846
1992
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
1847
1993
|
await access2(executablePath, constants2.F_OK);
|
|
1848
1994
|
return executablePath;
|
|
@@ -2189,7 +2335,7 @@ var MockProvider = class {
|
|
|
2189
2335
|
};
|
|
2190
2336
|
|
|
2191
2337
|
// src/evaluation/providers/vscode.ts
|
|
2192
|
-
import
|
|
2338
|
+
import path10 from "node:path";
|
|
2193
2339
|
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
2194
2340
|
var VSCodeProvider = class {
|
|
2195
2341
|
id;
|
|
@@ -2302,6 +2448,9 @@ var VSCodeProvider = class {
|
|
|
2302
2448
|
};
|
|
2303
2449
|
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
2304
2450
|
const parts = [];
|
|
2451
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
2452
|
+
parts.push(request.systemPrompt.trim());
|
|
2453
|
+
}
|
|
2305
2454
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2306
2455
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2307
2456
|
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
@@ -2319,7 +2468,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
2319
2468
|
return "";
|
|
2320
2469
|
}
|
|
2321
2470
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2322
|
-
const fileName =
|
|
2471
|
+
const fileName = path10.basename(absolutePath);
|
|
2323
2472
|
const fileUri = pathToFileUri2(absolutePath);
|
|
2324
2473
|
return `* [${fileName}](${fileUri})`;
|
|
2325
2474
|
});
|
|
@@ -2344,8 +2493,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
2344
2493
|
}
|
|
2345
2494
|
const unique = /* @__PURE__ */ new Map();
|
|
2346
2495
|
for (const attachment of attachments) {
|
|
2347
|
-
const absolutePath =
|
|
2348
|
-
const normalized = absolutePath.split(
|
|
2496
|
+
const absolutePath = path10.resolve(attachment);
|
|
2497
|
+
const normalized = absolutePath.split(path10.sep).join("/");
|
|
2349
2498
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2350
2499
|
if (!unique.has(absolutePath)) {
|
|
2351
2500
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2360,7 +2509,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
2360
2509
|
}
|
|
2361
2510
|
const unique = /* @__PURE__ */ new Map();
|
|
2362
2511
|
for (const attachment of attachments) {
|
|
2363
|
-
const absolutePath =
|
|
2512
|
+
const absolutePath = path10.resolve(attachment);
|
|
2364
2513
|
if (!unique.has(absolutePath)) {
|
|
2365
2514
|
unique.set(absolutePath, absolutePath);
|
|
2366
2515
|
}
|
|
@@ -2368,7 +2517,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
2368
2517
|
return Array.from(unique.values());
|
|
2369
2518
|
}
|
|
2370
2519
|
function pathToFileUri2(filePath) {
|
|
2371
|
-
const absolutePath =
|
|
2520
|
+
const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
|
|
2372
2521
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2373
2522
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2374
2523
|
return `file:///${normalizedPath}`;
|
|
@@ -2381,7 +2530,7 @@ function normalizeAttachments(attachments) {
|
|
|
2381
2530
|
}
|
|
2382
2531
|
const deduped = /* @__PURE__ */ new Set();
|
|
2383
2532
|
for (const attachment of attachments) {
|
|
2384
|
-
deduped.add(
|
|
2533
|
+
deduped.add(path10.resolve(attachment));
|
|
2385
2534
|
}
|
|
2386
2535
|
return Array.from(deduped);
|
|
2387
2536
|
}
|
|
@@ -2390,7 +2539,7 @@ function mergeAttachments(all) {
|
|
|
2390
2539
|
for (const list of all) {
|
|
2391
2540
|
if (!list) continue;
|
|
2392
2541
|
for (const inputFile of list) {
|
|
2393
|
-
deduped.add(
|
|
2542
|
+
deduped.add(path10.resolve(inputFile));
|
|
2394
2543
|
}
|
|
2395
2544
|
}
|
|
2396
2545
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -2436,9 +2585,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2436
2585
|
|
|
2437
2586
|
// src/evaluation/providers/targets-file.ts
|
|
2438
2587
|
import { constants as constants3 } from "node:fs";
|
|
2439
|
-
import { access as access3, readFile as
|
|
2440
|
-
import
|
|
2441
|
-
import { parse as
|
|
2588
|
+
import { access as access3, readFile as readFile5 } from "node:fs/promises";
|
|
2589
|
+
import path11 from "node:path";
|
|
2590
|
+
import { parse as parse3 } from "yaml";
|
|
2442
2591
|
function isRecord(value) {
|
|
2443
2592
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
2444
2593
|
}
|
|
@@ -2493,12 +2642,12 @@ async function fileExists3(filePath) {
|
|
|
2493
2642
|
}
|
|
2494
2643
|
}
|
|
2495
2644
|
async function readTargetDefinitions(filePath) {
|
|
2496
|
-
const absolutePath =
|
|
2645
|
+
const absolutePath = path11.resolve(filePath);
|
|
2497
2646
|
if (!await fileExists3(absolutePath)) {
|
|
2498
2647
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2499
2648
|
}
|
|
2500
|
-
const raw = await
|
|
2501
|
-
const parsed =
|
|
2649
|
+
const raw = await readFile5(absolutePath, "utf8");
|
|
2650
|
+
const parsed = parse3(raw);
|
|
2502
2651
|
if (!isRecord(parsed)) {
|
|
2503
2652
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
2504
2653
|
}
|
|
@@ -2541,18 +2690,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2541
2690
|
}
|
|
2542
2691
|
|
|
2543
2692
|
// src/evaluation/evaluators.ts
|
|
2544
|
-
|
|
2693
|
+
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
2694
|
+
|
|
2695
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
2696
|
+
|
|
2697
|
+
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
2698
|
+
|
|
2699
|
+
[[ ## expected_outcome ## ]]
|
|
2700
|
+
{{expected_outcome}}
|
|
2701
|
+
|
|
2702
|
+
[[ ## question ## ]]
|
|
2703
|
+
{{question}}
|
|
2704
|
+
|
|
2705
|
+
[[ ## reference_answer ## ]]
|
|
2706
|
+
{{reference_answer}}
|
|
2707
|
+
|
|
2708
|
+
[[ ## candidate_answer ## ]]
|
|
2709
|
+
{{candidate_answer}}`;
|
|
2545
2710
|
var LlmJudgeEvaluator = class {
|
|
2546
2711
|
kind = "llm_judge";
|
|
2547
2712
|
resolveJudgeProvider;
|
|
2548
2713
|
maxOutputTokens;
|
|
2549
2714
|
temperature;
|
|
2550
|
-
|
|
2715
|
+
evaluatorTemplate;
|
|
2551
2716
|
constructor(options) {
|
|
2552
2717
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
2553
2718
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
2554
2719
|
this.temperature = options.temperature;
|
|
2555
|
-
this.
|
|
2720
|
+
this.evaluatorTemplate = options.evaluatorTemplate;
|
|
2556
2721
|
}
|
|
2557
2722
|
async evaluate(context) {
|
|
2558
2723
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
@@ -2562,26 +2727,21 @@ var LlmJudgeEvaluator = class {
|
|
|
2562
2727
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2563
2728
|
}
|
|
2564
2729
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2565
|
-
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
2566
2730
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2567
|
-
|
|
2568
|
-
|
|
2569
|
-
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
|
|
2573
|
-
|
|
2574
|
-
|
|
2575
|
-
|
|
2576
|
-
|
|
2577
|
-
|
|
2578
|
-
prompt = substituteVariables(systemPrompt, variables);
|
|
2579
|
-
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2580
|
-
}
|
|
2581
|
-
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
2731
|
+
const variables = {
|
|
2732
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2733
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2734
|
+
candidate_answer: context.candidate.trim(),
|
|
2735
|
+
reference_answer: (context.evalCase.reference_answer ?? "").trim(),
|
|
2736
|
+
expected_outcome: context.evalCase.expected_outcome.trim(),
|
|
2737
|
+
question: formattedQuestion.trim()
|
|
2738
|
+
};
|
|
2739
|
+
const systemPrompt = buildOutputSchema();
|
|
2740
|
+
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
2741
|
+
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
2582
2742
|
const response = await judgeProvider.invoke({
|
|
2583
|
-
question:
|
|
2584
|
-
|
|
2743
|
+
question: userPrompt,
|
|
2744
|
+
systemPrompt,
|
|
2585
2745
|
evalCaseId: context.evalCase.id,
|
|
2586
2746
|
attempt: context.attempt,
|
|
2587
2747
|
maxOutputTokens: this.maxOutputTokens,
|
|
@@ -2594,11 +2754,9 @@ var LlmJudgeEvaluator = class {
|
|
|
2594
2754
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2595
2755
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2596
2756
|
const evaluatorRawRequest = {
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
target: context.target.name,
|
|
2601
|
-
...systemPrompt !== void 0 && { systemPrompt }
|
|
2757
|
+
userPrompt,
|
|
2758
|
+
systemPrompt,
|
|
2759
|
+
target: judgeProvider.targetName
|
|
2602
2760
|
};
|
|
2603
2761
|
return {
|
|
2604
2762
|
score,
|
|
@@ -2610,20 +2768,8 @@ var LlmJudgeEvaluator = class {
|
|
|
2610
2768
|
};
|
|
2611
2769
|
}
|
|
2612
2770
|
};
|
|
2613
|
-
function
|
|
2614
|
-
|
|
2615
|
-
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2616
|
-
""
|
|
2617
|
-
];
|
|
2618
|
-
if (hasReferenceAnswer) {
|
|
2619
|
-
basePrompt.push(
|
|
2620
|
-
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
2621
|
-
""
|
|
2622
|
-
);
|
|
2623
|
-
}
|
|
2624
|
-
basePrompt.push(
|
|
2625
|
-
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2626
|
-
"",
|
|
2771
|
+
function buildOutputSchema() {
|
|
2772
|
+
return [
|
|
2627
2773
|
"You must respond with a single JSON object matching this schema:",
|
|
2628
2774
|
"",
|
|
2629
2775
|
"{",
|
|
@@ -2632,30 +2778,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
|
|
|
2632
2778
|
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
2633
2779
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2634
2780
|
"}"
|
|
2635
|
-
);
|
|
2636
|
-
return basePrompt.join("\n");
|
|
2637
|
-
}
|
|
2638
|
-
function buildQualityPrompt(evalCase, candidate, question) {
|
|
2639
|
-
const parts = [
|
|
2640
|
-
"[[ ## expected_outcome ## ]]",
|
|
2641
|
-
evalCase.expected_outcome.trim(),
|
|
2642
|
-
"",
|
|
2643
|
-
"[[ ## question ## ]]",
|
|
2644
|
-
question.trim(),
|
|
2645
|
-
""
|
|
2646
|
-
];
|
|
2647
|
-
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
2648
|
-
parts.push(
|
|
2649
|
-
"[[ ## reference_answer ## ]]",
|
|
2650
|
-
evalCase.reference_answer.trim(),
|
|
2651
|
-
""
|
|
2652
|
-
);
|
|
2653
|
-
}
|
|
2654
|
-
parts.push(
|
|
2655
|
-
"[[ ## candidate_answer ## ]]",
|
|
2656
|
-
candidate.trim()
|
|
2657
|
-
);
|
|
2658
|
-
return parts.join("\n");
|
|
2781
|
+
].join("\n");
|
|
2659
2782
|
}
|
|
2660
2783
|
function clampScore(value) {
|
|
2661
2784
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -2737,9 +2860,6 @@ function extractJsonBlob(text) {
|
|
|
2737
2860
|
function isNonEmptyString(value) {
|
|
2738
2861
|
return typeof value === "string" && value.trim().length > 0;
|
|
2739
2862
|
}
|
|
2740
|
-
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
2741
|
-
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
2742
|
-
}
|
|
2743
2863
|
var CodeEvaluator = class {
|
|
2744
2864
|
kind = "code";
|
|
2745
2865
|
script;
|
|
@@ -2845,19 +2965,16 @@ function parseJsonSafe(payload) {
|
|
|
2845
2965
|
return void 0;
|
|
2846
2966
|
}
|
|
2847
2967
|
}
|
|
2848
|
-
function hasTemplateVariables(text) {
|
|
2849
|
-
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
2850
|
-
}
|
|
2851
2968
|
function substituteVariables(template, variables) {
|
|
2852
|
-
return template.replace(
|
|
2969
|
+
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
2853
2970
|
return variables[varName] ?? match;
|
|
2854
2971
|
});
|
|
2855
2972
|
}
|
|
2856
2973
|
|
|
2857
2974
|
// src/evaluation/orchestrator.ts
|
|
2858
|
-
import { createHash, randomUUID as
|
|
2975
|
+
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
2859
2976
|
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
2860
|
-
import
|
|
2977
|
+
import path12 from "node:path";
|
|
2861
2978
|
|
|
2862
2979
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
2863
2980
|
var Node = class {
|
|
@@ -3420,6 +3537,7 @@ async function evaluateCandidate(options) {
|
|
|
3420
3537
|
}
|
|
3421
3538
|
}
|
|
3422
3539
|
return {
|
|
3540
|
+
timestamp: completedAt.toISOString(),
|
|
3423
3541
|
eval_id: evalCase.id,
|
|
3424
3542
|
dataset: evalCase.dataset,
|
|
3425
3543
|
conversation_id: evalCase.conversation_id,
|
|
@@ -3427,14 +3545,12 @@ async function evaluateCandidate(options) {
|
|
|
3427
3545
|
hits: score.hits,
|
|
3428
3546
|
misses: score.misses,
|
|
3429
3547
|
candidate_answer: candidate,
|
|
3430
|
-
expected_aspect_count: score.expectedAspectCount,
|
|
3431
3548
|
target: target.name,
|
|
3432
|
-
timestamp: completedAt.toISOString(),
|
|
3433
3549
|
reasoning: score.reasoning,
|
|
3434
3550
|
raw_aspects: score.rawAspects,
|
|
3435
3551
|
agent_provider_request: agentProviderRequest,
|
|
3436
3552
|
lm_provider_request: lmProviderRequest,
|
|
3437
|
-
|
|
3553
|
+
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3438
3554
|
evaluator_results: evaluatorResults
|
|
3439
3555
|
};
|
|
3440
3556
|
}
|
|
@@ -3511,7 +3627,7 @@ async function runEvaluatorList(options) {
|
|
|
3511
3627
|
hits: score2.hits,
|
|
3512
3628
|
misses: score2.misses,
|
|
3513
3629
|
reasoning: score2.reasoning,
|
|
3514
|
-
|
|
3630
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3515
3631
|
});
|
|
3516
3632
|
continue;
|
|
3517
3633
|
}
|
|
@@ -3538,7 +3654,7 @@ async function runEvaluatorList(options) {
|
|
|
3538
3654
|
hits: score2.hits,
|
|
3539
3655
|
misses: score2.misses,
|
|
3540
3656
|
reasoning: score2.reasoning,
|
|
3541
|
-
|
|
3657
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3542
3658
|
});
|
|
3543
3659
|
continue;
|
|
3544
3660
|
}
|
|
@@ -3591,7 +3707,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3591
3707
|
promptInputs,
|
|
3592
3708
|
now,
|
|
3593
3709
|
judgeProvider,
|
|
3594
|
-
|
|
3710
|
+
evaluatorTemplateOverride: customPrompt,
|
|
3595
3711
|
evaluator: config
|
|
3596
3712
|
});
|
|
3597
3713
|
}
|
|
@@ -3632,8 +3748,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
3632
3748
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
3633
3749
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3634
3750
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
3635
|
-
const filePath =
|
|
3636
|
-
await mkdir2(
|
|
3751
|
+
const filePath = path12.resolve(directory, filename);
|
|
3752
|
+
await mkdir2(path12.dirname(filePath), { recursive: true });
|
|
3637
3753
|
const payload = {
|
|
3638
3754
|
eval_id: evalCase.id,
|
|
3639
3755
|
question: promptInputs.question,
|
|
@@ -3647,7 +3763,7 @@ function sanitizeFilename(value) {
|
|
|
3647
3763
|
return "prompt";
|
|
3648
3764
|
}
|
|
3649
3765
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3650
|
-
return sanitized.length > 0 ? sanitized :
|
|
3766
|
+
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
3651
3767
|
}
|
|
3652
3768
|
async function invokeProvider(provider, options) {
|
|
3653
3769
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -3703,6 +3819,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3703
3819
|
}
|
|
3704
3820
|
}
|
|
3705
3821
|
return {
|
|
3822
|
+
timestamp: timestamp.toISOString(),
|
|
3706
3823
|
eval_id: evalCase.id,
|
|
3707
3824
|
dataset: evalCase.dataset,
|
|
3708
3825
|
conversation_id: evalCase.conversation_id,
|
|
@@ -3710,9 +3827,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3710
3827
|
hits: [],
|
|
3711
3828
|
misses: [`Error: ${message}`],
|
|
3712
3829
|
candidate_answer: `Error occurred: ${message}`,
|
|
3713
|
-
expected_aspect_count: 0,
|
|
3714
3830
|
target: targetName,
|
|
3715
|
-
timestamp: timestamp.toISOString(),
|
|
3716
3831
|
raw_aspects: [],
|
|
3717
3832
|
agent_provider_request: agentProviderRequest,
|
|
3718
3833
|
lm_provider_request: lmProviderRequest,
|