@agentv/core 0.11.0 → 0.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -2
- package/dist/{chunk-YQBJAT5I.js → chunk-IOCVST3R.js} +1 -1
- package/dist/chunk-IOCVST3R.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +912 -747
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +46 -34
- package/dist/index.d.ts +46 -34
- package/dist/index.js +875 -708
- package/dist/index.js.map +1 -1
- package/package.json +5 -2
- package/dist/chunk-YQBJAT5I.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
readTextFile,
|
|
10
10
|
resolveFileReference,
|
|
11
11
|
resolveTargetDefinition
|
|
12
|
-
} from "./chunk-YQBJAT5I.js";
|
|
12
|
+
} from "./chunk-IOCVST3R.js";
|
|
13
13
|
|
|
14
14
|
// src/evaluation/types.ts
|
|
15
15
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
@@ -62,48 +62,197 @@ function getHitCount(result) {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
// src/evaluation/yaml-parser.ts
|
|
65
|
+
import { readFile as readFile4 } from "node:fs/promises";
|
|
66
|
+
import path6 from "node:path";
|
|
67
|
+
import { parse as parse2 } from "yaml";
|
|
68
|
+
|
|
69
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
70
|
+
function extractCodeBlocks(segments) {
|
|
71
|
+
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
|
+
const codeBlocks = [];
|
|
73
|
+
for (const segment of segments) {
|
|
74
|
+
const typeValue = segment["type"];
|
|
75
|
+
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
76
|
+
continue;
|
|
77
|
+
}
|
|
78
|
+
const textValue = segment["value"];
|
|
79
|
+
if (typeof textValue !== "string") {
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
83
|
+
if (matches) {
|
|
84
|
+
codeBlocks.push(...matches);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
return codeBlocks;
|
|
88
|
+
}
|
|
89
|
+
function formatFileContents(parts) {
|
|
90
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
91
|
+
if (fileCount > 0) {
|
|
92
|
+
return parts.map((part) => {
|
|
93
|
+
if (part.isFile && part.displayPath) {
|
|
94
|
+
return `<file path="${part.displayPath}">
|
|
95
|
+
${part.content}
|
|
96
|
+
</file>`;
|
|
97
|
+
}
|
|
98
|
+
return part.content;
|
|
99
|
+
}).join("\n\n");
|
|
100
|
+
}
|
|
101
|
+
return parts.map((p) => p.content).join(" ");
|
|
102
|
+
}
|
|
103
|
+
function formatSegment(segment) {
|
|
104
|
+
const type = asString(segment.type);
|
|
105
|
+
if (type === "text") {
|
|
106
|
+
return asString(segment.value);
|
|
107
|
+
}
|
|
108
|
+
if (type === "guideline_ref") {
|
|
109
|
+
const refPath = asString(segment.path);
|
|
110
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
111
|
+
}
|
|
112
|
+
if (type === "file") {
|
|
113
|
+
const text = asString(segment.text);
|
|
114
|
+
const filePath = asString(segment.path);
|
|
115
|
+
if (text && filePath) {
|
|
116
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
return void 0;
|
|
120
|
+
}
|
|
121
|
+
function hasVisibleContent(segments) {
|
|
122
|
+
return segments.some((segment) => {
|
|
123
|
+
const type = asString(segment.type);
|
|
124
|
+
if (type === "text") {
|
|
125
|
+
const value = asString(segment.value);
|
|
126
|
+
return value !== void 0 && value.trim().length > 0;
|
|
127
|
+
}
|
|
128
|
+
if (type === "guideline_ref") {
|
|
129
|
+
return false;
|
|
130
|
+
}
|
|
131
|
+
if (type === "file") {
|
|
132
|
+
const text = asString(segment.text);
|
|
133
|
+
return text !== void 0 && text.trim().length > 0;
|
|
134
|
+
}
|
|
135
|
+
return false;
|
|
136
|
+
});
|
|
137
|
+
}
|
|
138
|
+
function asString(value) {
|
|
139
|
+
return typeof value === "string" ? value : void 0;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// src/evaluation/loaders/config-loader.ts
|
|
65
143
|
import micromatch from "micromatch";
|
|
144
|
+
import { readFile } from "node:fs/promises";
|
|
145
|
+
import path2 from "node:path";
|
|
146
|
+
import { parse } from "yaml";
|
|
147
|
+
|
|
148
|
+
// src/evaluation/loaders/file-resolver.ts
|
|
66
149
|
import { constants } from "node:fs";
|
|
67
|
-
import { access, readFile } from "node:fs/promises";
|
|
150
|
+
import { access } from "node:fs/promises";
|
|
68
151
|
import path from "node:path";
|
|
69
|
-
|
|
70
|
-
import { parse } from "yaml";
|
|
71
|
-
var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
72
|
-
var ANSI_YELLOW = "\x1B[33m";
|
|
73
|
-
var ANSI_RESET = "\x1B[0m";
|
|
74
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
75
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
76
|
-
async function readTestSuiteMetadata(testFilePath) {
|
|
152
|
+
async function fileExists2(absolutePath) {
|
|
77
153
|
try {
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
const parsed = parse(content);
|
|
81
|
-
if (!isJsonObject(parsed)) {
|
|
82
|
-
return {};
|
|
83
|
-
}
|
|
84
|
-
return { target: extractTargetFromSuite(parsed) };
|
|
154
|
+
await access(absolutePath, constants.F_OK);
|
|
155
|
+
return true;
|
|
85
156
|
} catch {
|
|
86
|
-
return
|
|
157
|
+
return false;
|
|
87
158
|
}
|
|
88
159
|
}
|
|
89
|
-
function
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
160
|
+
function resolveToAbsolutePath(candidate) {
|
|
161
|
+
if (candidate instanceof URL) {
|
|
162
|
+
return new URL(candidate).pathname;
|
|
163
|
+
}
|
|
164
|
+
if (typeof candidate === "string") {
|
|
165
|
+
if (candidate.startsWith("file://")) {
|
|
166
|
+
return new URL(candidate).pathname;
|
|
95
167
|
}
|
|
168
|
+
return path.resolve(candidate);
|
|
96
169
|
}
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
170
|
+
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
171
|
+
}
|
|
172
|
+
function buildDirectoryChain2(filePath, repoRoot) {
|
|
173
|
+
const directories = [];
|
|
174
|
+
const seen = /* @__PURE__ */ new Set();
|
|
175
|
+
const boundary = path.resolve(repoRoot);
|
|
176
|
+
let current = path.resolve(path.dirname(filePath));
|
|
177
|
+
while (current !== void 0) {
|
|
178
|
+
if (!seen.has(current)) {
|
|
179
|
+
directories.push(current);
|
|
180
|
+
seen.add(current);
|
|
181
|
+
}
|
|
182
|
+
if (current === boundary) {
|
|
183
|
+
break;
|
|
184
|
+
}
|
|
185
|
+
const parent = path.dirname(current);
|
|
186
|
+
if (parent === current) {
|
|
187
|
+
break;
|
|
188
|
+
}
|
|
189
|
+
current = parent;
|
|
100
190
|
}
|
|
101
|
-
|
|
191
|
+
if (!seen.has(boundary)) {
|
|
192
|
+
directories.push(boundary);
|
|
193
|
+
}
|
|
194
|
+
return directories;
|
|
195
|
+
}
|
|
196
|
+
function buildSearchRoots2(evalPath, repoRoot) {
|
|
197
|
+
const uniqueRoots = [];
|
|
198
|
+
const addRoot = (root) => {
|
|
199
|
+
const normalized = path.resolve(root);
|
|
200
|
+
if (!uniqueRoots.includes(normalized)) {
|
|
201
|
+
uniqueRoots.push(normalized);
|
|
202
|
+
}
|
|
203
|
+
};
|
|
204
|
+
let currentDir = path.dirname(evalPath);
|
|
205
|
+
let reachedBoundary = false;
|
|
206
|
+
while (!reachedBoundary) {
|
|
207
|
+
addRoot(currentDir);
|
|
208
|
+
const parentDir = path.dirname(currentDir);
|
|
209
|
+
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
210
|
+
reachedBoundary = true;
|
|
211
|
+
} else {
|
|
212
|
+
currentDir = parentDir;
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
addRoot(repoRoot);
|
|
216
|
+
addRoot(process.cwd());
|
|
217
|
+
return uniqueRoots;
|
|
102
218
|
}
|
|
219
|
+
function trimLeadingSeparators(value) {
|
|
220
|
+
const trimmed = value.replace(/^[/\\]+/, "");
|
|
221
|
+
return trimmed.length > 0 ? trimmed : value;
|
|
222
|
+
}
|
|
223
|
+
async function resolveFileReference2(rawValue, searchRoots) {
|
|
224
|
+
const displayPath = trimLeadingSeparators(rawValue);
|
|
225
|
+
const potentialPaths = [];
|
|
226
|
+
if (path.isAbsolute(rawValue)) {
|
|
227
|
+
potentialPaths.push(path.normalize(rawValue));
|
|
228
|
+
}
|
|
229
|
+
for (const base of searchRoots) {
|
|
230
|
+
potentialPaths.push(path.resolve(base, displayPath));
|
|
231
|
+
}
|
|
232
|
+
const attempted = [];
|
|
233
|
+
const seen = /* @__PURE__ */ new Set();
|
|
234
|
+
for (const candidate of potentialPaths) {
|
|
235
|
+
const absoluteCandidate = path.resolve(candidate);
|
|
236
|
+
if (seen.has(absoluteCandidate)) {
|
|
237
|
+
continue;
|
|
238
|
+
}
|
|
239
|
+
seen.add(absoluteCandidate);
|
|
240
|
+
attempted.push(absoluteCandidate);
|
|
241
|
+
if (await fileExists2(absoluteCandidate)) {
|
|
242
|
+
return { displayPath, resolvedPath: absoluteCandidate, attempted };
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
return { displayPath, attempted };
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
// src/evaluation/loaders/config-loader.ts
|
|
249
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
250
|
+
var ANSI_YELLOW = "\x1B[33m";
|
|
251
|
+
var ANSI_RESET = "\x1B[0m";
|
|
103
252
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
104
|
-
const directories =
|
|
253
|
+
const directories = buildDirectoryChain2(evalFilePath, repoRoot);
|
|
105
254
|
for (const directory of directories) {
|
|
106
|
-
const configPath =
|
|
255
|
+
const configPath = path2.join(directory, ".agentv", "config.yaml");
|
|
107
256
|
if (!await fileExists2(configPath)) {
|
|
108
257
|
continue;
|
|
109
258
|
}
|
|
@@ -146,24 +295,134 @@ function isGuidelineFile(filePath, patterns) {
|
|
|
146
295
|
const patternsToUse = patterns ?? [];
|
|
147
296
|
return micromatch.isMatch(normalized, patternsToUse);
|
|
148
297
|
}
|
|
149
|
-
function
|
|
150
|
-
const
|
|
151
|
-
|
|
152
|
-
const
|
|
153
|
-
if (typeof
|
|
298
|
+
function extractTargetFromSuite(suite) {
|
|
299
|
+
const execution = suite.execution;
|
|
300
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
301
|
+
const executionTarget = execution.target;
|
|
302
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
303
|
+
return executionTarget.trim();
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
const targetValue = suite.target;
|
|
307
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
308
|
+
return targetValue.trim();
|
|
309
|
+
}
|
|
310
|
+
return void 0;
|
|
311
|
+
}
|
|
312
|
+
function logWarning(message) {
|
|
313
|
+
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
317
|
+
import path3 from "node:path";
|
|
318
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
319
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
320
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
321
|
+
const execution = rawEvalCase.execution;
|
|
322
|
+
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
323
|
+
if (candidateEvaluators === void 0) {
|
|
324
|
+
return void 0;
|
|
325
|
+
}
|
|
326
|
+
if (!Array.isArray(candidateEvaluators)) {
|
|
327
|
+
logWarning2(`Skipping evaluators for '${evalId}': expected array`);
|
|
328
|
+
return void 0;
|
|
329
|
+
}
|
|
330
|
+
const evaluators = [];
|
|
331
|
+
for (const rawEvaluator of candidateEvaluators) {
|
|
332
|
+
if (!isJsonObject2(rawEvaluator)) {
|
|
333
|
+
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
154
334
|
continue;
|
|
155
335
|
}
|
|
156
|
-
const
|
|
157
|
-
|
|
336
|
+
const name = asString2(rawEvaluator.name);
|
|
337
|
+
const typeValue = rawEvaluator.type;
|
|
338
|
+
if (!name || !isEvaluatorKind(typeValue)) {
|
|
339
|
+
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
158
340
|
continue;
|
|
159
341
|
}
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
342
|
+
if (typeValue === "code") {
|
|
343
|
+
const script = asString2(rawEvaluator.script);
|
|
344
|
+
if (!script) {
|
|
345
|
+
logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
346
|
+
continue;
|
|
347
|
+
}
|
|
348
|
+
const cwd = asString2(rawEvaluator.cwd);
|
|
349
|
+
let resolvedCwd;
|
|
350
|
+
if (cwd) {
|
|
351
|
+
const resolved = await resolveFileReference2(cwd, searchRoots);
|
|
352
|
+
if (resolved.resolvedPath) {
|
|
353
|
+
resolvedCwd = path3.resolve(resolved.resolvedPath);
|
|
354
|
+
} else {
|
|
355
|
+
logWarning2(
|
|
356
|
+
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
357
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
358
|
+
);
|
|
359
|
+
}
|
|
360
|
+
} else {
|
|
361
|
+
resolvedCwd = searchRoots[0];
|
|
362
|
+
}
|
|
363
|
+
evaluators.push({
|
|
364
|
+
name,
|
|
365
|
+
type: "code",
|
|
366
|
+
script,
|
|
367
|
+
cwd,
|
|
368
|
+
resolvedCwd
|
|
369
|
+
});
|
|
370
|
+
continue;
|
|
371
|
+
}
|
|
372
|
+
const prompt = asString2(rawEvaluator.prompt);
|
|
373
|
+
let promptPath;
|
|
374
|
+
if (prompt) {
|
|
375
|
+
const resolved = await resolveFileReference2(prompt, searchRoots);
|
|
376
|
+
if (resolved.resolvedPath) {
|
|
377
|
+
promptPath = path3.resolve(resolved.resolvedPath);
|
|
378
|
+
} else {
|
|
379
|
+
logWarning2(
|
|
380
|
+
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
381
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
382
|
+
);
|
|
383
|
+
}
|
|
163
384
|
}
|
|
385
|
+
const _model = asString2(rawEvaluator.model);
|
|
386
|
+
evaluators.push({
|
|
387
|
+
name,
|
|
388
|
+
type: "llm_judge",
|
|
389
|
+
prompt,
|
|
390
|
+
promptPath
|
|
391
|
+
});
|
|
164
392
|
}
|
|
165
|
-
return
|
|
393
|
+
return evaluators.length > 0 ? evaluators : void 0;
|
|
166
394
|
}
|
|
395
|
+
function coerceEvaluator(candidate, contextId) {
|
|
396
|
+
if (typeof candidate !== "string") {
|
|
397
|
+
return void 0;
|
|
398
|
+
}
|
|
399
|
+
if (isEvaluatorKind(candidate)) {
|
|
400
|
+
return candidate;
|
|
401
|
+
}
|
|
402
|
+
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
403
|
+
return void 0;
|
|
404
|
+
}
|
|
405
|
+
function asString2(value) {
|
|
406
|
+
return typeof value === "string" ? value : void 0;
|
|
407
|
+
}
|
|
408
|
+
function isJsonObject2(value) {
|
|
409
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
410
|
+
}
|
|
411
|
+
function logWarning2(message, details) {
|
|
412
|
+
if (details && details.length > 0) {
|
|
413
|
+
const detailBlock = details.join("\n");
|
|
414
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}
|
|
415
|
+
${detailBlock}${ANSI_RESET2}`);
|
|
416
|
+
} else {
|
|
417
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
// src/evaluation/loaders/message-processor.ts
|
|
422
|
+
import { readFile as readFile2 } from "node:fs/promises";
|
|
423
|
+
import path4 from "node:path";
|
|
424
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
425
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
167
426
|
async function processMessages(options) {
|
|
168
427
|
const {
|
|
169
428
|
messages,
|
|
@@ -189,28 +448,28 @@ async function processMessages(options) {
|
|
|
189
448
|
if (!isJsonObject(rawSegment)) {
|
|
190
449
|
continue;
|
|
191
450
|
}
|
|
192
|
-
const segmentType =
|
|
451
|
+
const segmentType = asString3(rawSegment.type);
|
|
193
452
|
if (segmentType === "file") {
|
|
194
|
-
const rawValue =
|
|
453
|
+
const rawValue = asString3(rawSegment.value);
|
|
195
454
|
if (!rawValue) {
|
|
196
455
|
continue;
|
|
197
456
|
}
|
|
198
|
-
const { displayPath, resolvedPath, attempted } = await
|
|
457
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
199
458
|
rawValue,
|
|
200
459
|
searchRoots
|
|
201
460
|
);
|
|
202
461
|
if (!resolvedPath) {
|
|
203
462
|
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
204
463
|
const context = messageType === "input" ? "" : " in expected_messages";
|
|
205
|
-
|
|
464
|
+
logWarning3(`File not found${context}: ${displayPath}`, attempts);
|
|
206
465
|
continue;
|
|
207
466
|
}
|
|
208
467
|
try {
|
|
209
|
-
const fileContent = (await
|
|
468
|
+
const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
210
469
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
211
|
-
const relativeToRepo =
|
|
470
|
+
const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
|
|
212
471
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
213
|
-
guidelinePaths.push(
|
|
472
|
+
guidelinePaths.push(path4.resolve(resolvedPath));
|
|
214
473
|
if (verbose) {
|
|
215
474
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
216
475
|
console.log(` Resolved to: ${resolvedPath}`);
|
|
@@ -222,7 +481,7 @@ async function processMessages(options) {
|
|
|
222
481
|
type: "file",
|
|
223
482
|
path: displayPath,
|
|
224
483
|
text: fileContent,
|
|
225
|
-
resolvedPath:
|
|
484
|
+
resolvedPath: path4.resolve(resolvedPath)
|
|
226
485
|
});
|
|
227
486
|
if (verbose) {
|
|
228
487
|
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
@@ -231,7 +490,7 @@ async function processMessages(options) {
|
|
|
231
490
|
}
|
|
232
491
|
} catch (error) {
|
|
233
492
|
const context = messageType === "input" ? "" : " expected output";
|
|
234
|
-
|
|
493
|
+
logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
235
494
|
}
|
|
236
495
|
continue;
|
|
237
496
|
}
|
|
@@ -245,201 +504,117 @@ async function processMessages(options) {
|
|
|
245
504
|
}
|
|
246
505
|
return segments;
|
|
247
506
|
}
|
|
248
|
-
async function
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
const absoluteTestPath = path.resolve(evalFilePath);
|
|
252
|
-
if (!await fileExists2(absoluteTestPath)) {
|
|
253
|
-
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
254
|
-
}
|
|
255
|
-
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
256
|
-
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
257
|
-
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
258
|
-
const guidelinePatterns = config?.guideline_patterns;
|
|
259
|
-
const rawFile = await readFile(absoluteTestPath, "utf8");
|
|
260
|
-
const parsed = parse(rawFile);
|
|
261
|
-
if (!isJsonObject(parsed)) {
|
|
262
|
-
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
263
|
-
}
|
|
264
|
-
const suite = parsed;
|
|
265
|
-
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
266
|
-
const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
267
|
-
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
268
|
-
const schema = suite.$schema;
|
|
269
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
270
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
271
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
272
|
-
throw new Error(message);
|
|
507
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
508
|
+
if (typeof content === "string") {
|
|
509
|
+
return content;
|
|
273
510
|
}
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
511
|
+
if (!content) {
|
|
512
|
+
return "";
|
|
277
513
|
}
|
|
278
|
-
const
|
|
279
|
-
const
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
for (const rawEvalcase of rawTestcases) {
|
|
283
|
-
if (!isJsonObject(rawEvalcase)) {
|
|
284
|
-
logWarning("Skipping invalid eval case entry (expected object)");
|
|
514
|
+
const parts = [];
|
|
515
|
+
for (const entry of content) {
|
|
516
|
+
if (typeof entry === "string") {
|
|
517
|
+
parts.push({ content: entry, isFile: false });
|
|
285
518
|
continue;
|
|
286
519
|
}
|
|
287
|
-
|
|
288
|
-
const id = asString(evalcase.id);
|
|
289
|
-
if (evalIdFilter && id !== evalIdFilter) {
|
|
520
|
+
if (!isJsonObject(entry)) {
|
|
290
521
|
continue;
|
|
291
522
|
}
|
|
292
|
-
const
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
523
|
+
const segmentType = asString3(entry.type);
|
|
524
|
+
if (segmentType === "file") {
|
|
525
|
+
const rawValue = asString3(entry.value);
|
|
526
|
+
if (!rawValue) {
|
|
527
|
+
continue;
|
|
528
|
+
}
|
|
529
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
530
|
+
rawValue,
|
|
531
|
+
searchRoots
|
|
532
|
+
);
|
|
533
|
+
if (!resolvedPath) {
|
|
534
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
535
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
536
|
+
continue;
|
|
537
|
+
}
|
|
538
|
+
try {
|
|
539
|
+
const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
540
|
+
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
541
|
+
if (verbose) {
|
|
542
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
543
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
544
|
+
}
|
|
545
|
+
} catch (error) {
|
|
546
|
+
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
547
|
+
}
|
|
298
548
|
continue;
|
|
299
549
|
}
|
|
300
|
-
const
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
304
|
-
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
550
|
+
const textValue = asString3(entry.text);
|
|
551
|
+
if (typeof textValue === "string") {
|
|
552
|
+
parts.push({ content: textValue, isFile: false });
|
|
305
553
|
continue;
|
|
306
554
|
}
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
const inputTextParts = [];
|
|
312
|
-
const inputSegments = await processMessages({
|
|
313
|
-
messages: inputMessages,
|
|
314
|
-
searchRoots,
|
|
315
|
-
repoRootPath,
|
|
316
|
-
guidelinePatterns,
|
|
317
|
-
guidelinePaths,
|
|
318
|
-
textParts: inputTextParts,
|
|
319
|
-
messageType: "input",
|
|
320
|
-
verbose
|
|
321
|
-
});
|
|
322
|
-
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
323
|
-
messages: expectedMessages,
|
|
324
|
-
searchRoots,
|
|
325
|
-
repoRootPath,
|
|
326
|
-
guidelinePatterns,
|
|
327
|
-
messageType: "output",
|
|
328
|
-
verbose
|
|
329
|
-
}) : [];
|
|
330
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
331
|
-
const expectedContent = expectedMessages[0]?.content;
|
|
332
|
-
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
333
|
-
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
334
|
-
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
335
|
-
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
336
|
-
const userFilePaths = [];
|
|
337
|
-
for (const segment of inputSegments) {
|
|
338
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
339
|
-
userFilePaths.push(segment.resolvedPath);
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
const allFilePaths = [
|
|
343
|
-
...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
344
|
-
...userFilePaths
|
|
345
|
-
];
|
|
346
|
-
const testCase = {
|
|
347
|
-
id,
|
|
348
|
-
dataset: datasetName,
|
|
349
|
-
conversation_id: conversationId,
|
|
350
|
-
question,
|
|
351
|
-
input_messages: inputMessages,
|
|
352
|
-
input_segments: inputSegments,
|
|
353
|
-
output_segments: outputSegments,
|
|
354
|
-
reference_answer: referenceAnswer,
|
|
355
|
-
guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
|
|
356
|
-
guideline_patterns: guidelinePatterns,
|
|
357
|
-
file_paths: allFilePaths,
|
|
358
|
-
code_snippets: codeSnippets,
|
|
359
|
-
expected_outcome: outcome,
|
|
360
|
-
evaluator: evalCaseEvaluatorKind,
|
|
361
|
-
evaluators
|
|
362
|
-
};
|
|
363
|
-
if (verbose) {
|
|
364
|
-
console.log(`
|
|
365
|
-
[Eval Case: ${id}]`);
|
|
366
|
-
if (testCase.guideline_paths.length > 0) {
|
|
367
|
-
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
368
|
-
for (const guidelinePath of testCase.guideline_paths) {
|
|
369
|
-
console.log(` - ${guidelinePath}`);
|
|
370
|
-
}
|
|
371
|
-
} else {
|
|
372
|
-
console.log(" No guidelines found");
|
|
373
|
-
}
|
|
555
|
+
const valueValue = asString3(entry.value);
|
|
556
|
+
if (typeof valueValue === "string") {
|
|
557
|
+
parts.push({ content: valueValue, isFile: false });
|
|
558
|
+
continue;
|
|
374
559
|
}
|
|
375
|
-
|
|
560
|
+
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
376
561
|
}
|
|
377
|
-
return
|
|
562
|
+
return formatFileContents(parts);
|
|
378
563
|
}
|
|
379
|
-
function
|
|
380
|
-
|
|
381
|
-
return true;
|
|
382
|
-
}
|
|
383
|
-
let messagesWithContent = 0;
|
|
384
|
-
for (const segments of processedSegmentsByMessage) {
|
|
385
|
-
if (hasVisibleContent(segments)) {
|
|
386
|
-
messagesWithContent++;
|
|
387
|
-
}
|
|
388
|
-
}
|
|
389
|
-
return messagesWithContent > 1;
|
|
564
|
+
function asString3(value) {
|
|
565
|
+
return typeof value === "string" ? value : void 0;
|
|
390
566
|
}
|
|
391
|
-
function
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
if (type === "text") {
|
|
395
|
-
const value = asString(segment.value);
|
|
396
|
-
return value !== void 0 && value.trim().length > 0;
|
|
397
|
-
}
|
|
398
|
-
if (type === "guideline_ref") {
|
|
399
|
-
return false;
|
|
400
|
-
}
|
|
401
|
-
if (type === "file") {
|
|
402
|
-
const text = asString(segment.text);
|
|
403
|
-
return text !== void 0 && text.trim().length > 0;
|
|
404
|
-
}
|
|
405
|
-
return false;
|
|
406
|
-
});
|
|
567
|
+
function cloneJsonObject(source) {
|
|
568
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
569
|
+
return Object.fromEntries(entries);
|
|
407
570
|
}
|
|
408
|
-
function
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
return asString(segment.value);
|
|
571
|
+
function cloneJsonValue(value) {
|
|
572
|
+
if (value === null) {
|
|
573
|
+
return null;
|
|
412
574
|
}
|
|
413
|
-
if (
|
|
414
|
-
|
|
415
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
575
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
576
|
+
return value;
|
|
416
577
|
}
|
|
417
|
-
if (
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
578
|
+
if (Array.isArray(value)) {
|
|
579
|
+
return value.map((item) => cloneJsonValue(item));
|
|
580
|
+
}
|
|
581
|
+
if (typeof value === "object") {
|
|
582
|
+
return cloneJsonObject(value);
|
|
583
|
+
}
|
|
584
|
+
return value;
|
|
585
|
+
}
|
|
586
|
+
function logWarning3(message, details) {
|
|
587
|
+
if (details && details.length > 0) {
|
|
588
|
+
const detailBlock = details.join("\n");
|
|
589
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
590
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
591
|
+
} else {
|
|
592
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
423
593
|
}
|
|
424
|
-
return void 0;
|
|
425
594
|
}
|
|
595
|
+
|
|
596
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
597
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
598
|
+
import path5 from "node:path";
|
|
599
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
600
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
426
601
|
async function buildPromptInputs(testCase) {
|
|
427
602
|
const guidelineParts = [];
|
|
428
603
|
for (const rawPath of testCase.guideline_paths) {
|
|
429
|
-
const absolutePath =
|
|
604
|
+
const absolutePath = path5.resolve(rawPath);
|
|
430
605
|
if (!await fileExists2(absolutePath)) {
|
|
431
|
-
|
|
606
|
+
logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
432
607
|
continue;
|
|
433
608
|
}
|
|
434
609
|
try {
|
|
435
|
-
const content = (await
|
|
610
|
+
const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
436
611
|
guidelineParts.push({
|
|
437
612
|
content,
|
|
438
613
|
isFile: true,
|
|
439
|
-
displayPath:
|
|
614
|
+
displayPath: path5.basename(absolutePath)
|
|
440
615
|
});
|
|
441
616
|
} catch (error) {
|
|
442
|
-
|
|
617
|
+
logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
443
618
|
}
|
|
444
619
|
}
|
|
445
620
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -463,9 +638,9 @@ async function buildPromptInputs(testCase) {
|
|
|
463
638
|
messageSegments.push({ type: "text", value: segment });
|
|
464
639
|
}
|
|
465
640
|
} else if (isJsonObject(segment)) {
|
|
466
|
-
const type =
|
|
641
|
+
const type = asString4(segment.type);
|
|
467
642
|
if (type === "file") {
|
|
468
|
-
const value =
|
|
643
|
+
const value = asString4(segment.value);
|
|
469
644
|
if (!value) continue;
|
|
470
645
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
471
646
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -476,7 +651,7 @@ async function buildPromptInputs(testCase) {
|
|
|
476
651
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
477
652
|
}
|
|
478
653
|
} else if (type === "text") {
|
|
479
|
-
const textValue =
|
|
654
|
+
const textValue = asString4(segment.value);
|
|
480
655
|
if (textValue && textValue.trim().length > 0) {
|
|
481
656
|
messageSegments.push({ type: "text", value: textValue });
|
|
482
657
|
}
|
|
@@ -532,6 +707,18 @@ ${messageContent}`);
|
|
|
532
707
|
}) : void 0;
|
|
533
708
|
return { question, guidelines, chatPrompt };
|
|
534
709
|
}
|
|
710
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
711
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
712
|
+
return true;
|
|
713
|
+
}
|
|
714
|
+
let messagesWithContent = 0;
|
|
715
|
+
for (const segments of processedSegmentsByMessage) {
|
|
716
|
+
if (hasVisibleContent(segments)) {
|
|
717
|
+
messagesWithContent++;
|
|
718
|
+
}
|
|
719
|
+
}
|
|
720
|
+
return messagesWithContent > 1;
|
|
721
|
+
}
|
|
535
722
|
function buildChatPromptFromSegments(options) {
|
|
536
723
|
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
537
724
|
if (messages.length === 0) {
|
|
@@ -573,13 +760,12 @@ ${guidelineContent.trim()}`);
|
|
|
573
760
|
const segments = segmentsByMessage[i];
|
|
574
761
|
const contentParts = [];
|
|
575
762
|
let role = message.role;
|
|
576
|
-
let name;
|
|
577
763
|
if (role === "system") {
|
|
578
764
|
role = "assistant";
|
|
579
765
|
contentParts.push("@[System]:");
|
|
580
766
|
} else if (role === "tool") {
|
|
581
|
-
role = "
|
|
582
|
-
|
|
767
|
+
role = "assistant";
|
|
768
|
+
contentParts.push("@[Tool]:");
|
|
583
769
|
}
|
|
584
770
|
for (const segment of segments) {
|
|
585
771
|
if (segment.type === "guideline_ref") {
|
|
@@ -597,282 +783,398 @@ ${guidelineContent.trim()}`);
|
|
|
597
783
|
if (contentParts.length === 0) {
|
|
598
784
|
continue;
|
|
599
785
|
}
|
|
786
|
+
const content = contentParts.join("\n");
|
|
600
787
|
chatPrompt.push({
|
|
601
788
|
role,
|
|
602
|
-
content
|
|
603
|
-
...name ? { name } : {}
|
|
789
|
+
content
|
|
604
790
|
});
|
|
605
791
|
}
|
|
606
792
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
607
793
|
}
|
|
608
|
-
|
|
609
|
-
try {
|
|
610
|
-
await access(absolutePath, constants.F_OK);
|
|
611
|
-
return true;
|
|
612
|
-
} catch {
|
|
613
|
-
return false;
|
|
614
|
-
}
|
|
615
|
-
}
|
|
616
|
-
function resolveToAbsolutePath(candidate) {
|
|
617
|
-
if (candidate instanceof URL) {
|
|
618
|
-
return fileURLToPath(candidate);
|
|
619
|
-
}
|
|
620
|
-
if (typeof candidate === "string") {
|
|
621
|
-
if (candidate.startsWith("file://")) {
|
|
622
|
-
return fileURLToPath(new URL(candidate));
|
|
623
|
-
}
|
|
624
|
-
return path.resolve(candidate);
|
|
625
|
-
}
|
|
626
|
-
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
627
|
-
}
|
|
628
|
-
function asString(value) {
|
|
794
|
+
function asString4(value) {
|
|
629
795
|
return typeof value === "string" ? value : void 0;
|
|
630
796
|
}
|
|
631
|
-
function
|
|
632
|
-
|
|
633
|
-
return Object.fromEntries(entries);
|
|
797
|
+
function logWarning4(message) {
|
|
798
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
634
799
|
}
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
800
|
+
|
|
801
|
+
// src/evaluation/yaml-parser.ts
|
|
802
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
803
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
804
|
+
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
805
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
806
|
+
try {
|
|
807
|
+
const absolutePath = path6.resolve(testFilePath);
|
|
808
|
+
const content = await readFile4(absolutePath, "utf8");
|
|
809
|
+
const parsed = parse2(content);
|
|
810
|
+
if (!isJsonObject(parsed)) {
|
|
811
|
+
return {};
|
|
812
|
+
}
|
|
813
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
814
|
+
} catch {
|
|
815
|
+
return {};
|
|
644
816
|
}
|
|
645
|
-
return cloneJsonObject(value);
|
|
646
817
|
}
|
|
647
|
-
function
|
|
648
|
-
const
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
818
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
819
|
+
const verbose = options?.verbose ?? false;
|
|
820
|
+
const evalIdFilter = options?.evalId;
|
|
821
|
+
const absoluteTestPath = path6.resolve(evalFilePath);
|
|
822
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
823
|
+
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
824
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
825
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
826
|
+
const rawFile = await readFile4(absoluteTestPath, "utf8");
|
|
827
|
+
const parsed = parse2(rawFile);
|
|
828
|
+
if (!isJsonObject(parsed)) {
|
|
829
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
658
830
|
}
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
831
|
+
const suite = parsed;
|
|
832
|
+
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
833
|
+
const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
834
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
835
|
+
const schema = suite.$schema;
|
|
836
|
+
if (schema !== SCHEMA_EVAL_V2) {
|
|
837
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
838
|
+
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
839
|
+
throw new Error(message);
|
|
664
840
|
}
|
|
665
|
-
|
|
666
|
-
|
|
841
|
+
const rawTestcases = suite.evalcases;
|
|
842
|
+
if (!Array.isArray(rawTestcases)) {
|
|
843
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
667
844
|
}
|
|
668
|
-
const
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
continue;
|
|
676
|
-
}
|
|
677
|
-
const segmentType = asString(entry.type);
|
|
678
|
-
if (segmentType === "file") {
|
|
679
|
-
const rawValue = asString(entry.value);
|
|
680
|
-
if (!rawValue) {
|
|
681
|
-
continue;
|
|
682
|
-
}
|
|
683
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
684
|
-
rawValue,
|
|
685
|
-
searchRoots
|
|
686
|
-
);
|
|
687
|
-
if (!resolvedPath) {
|
|
688
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
689
|
-
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
690
|
-
continue;
|
|
691
|
-
}
|
|
692
|
-
try {
|
|
693
|
-
const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
694
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
695
|
-
if (verbose) {
|
|
696
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
697
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
698
|
-
}
|
|
699
|
-
} catch (error) {
|
|
700
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
701
|
-
}
|
|
702
|
-
continue;
|
|
703
|
-
}
|
|
704
|
-
const textValue = asString(entry.text);
|
|
705
|
-
if (typeof textValue === "string") {
|
|
706
|
-
parts.push({ content: textValue, isFile: false });
|
|
845
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
846
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
847
|
+
const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
|
|
848
|
+
const results = [];
|
|
849
|
+
for (const rawEvalcase of rawTestcases) {
|
|
850
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
851
|
+
logWarning5("Skipping invalid eval case entry (expected object)");
|
|
707
852
|
continue;
|
|
708
853
|
}
|
|
709
|
-
const
|
|
710
|
-
|
|
711
|
-
|
|
854
|
+
const evalcase = rawEvalcase;
|
|
855
|
+
const id = asString5(evalcase.id);
|
|
856
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
712
857
|
continue;
|
|
713
858
|
}
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
721
|
-
if (candidateEvaluators === void 0) {
|
|
722
|
-
return void 0;
|
|
723
|
-
}
|
|
724
|
-
if (!Array.isArray(candidateEvaluators)) {
|
|
725
|
-
logWarning(`Skipping evaluators for '${evalId}': expected array`);
|
|
726
|
-
return void 0;
|
|
727
|
-
}
|
|
728
|
-
const evaluators = [];
|
|
729
|
-
for (const rawEvaluator of candidateEvaluators) {
|
|
730
|
-
if (!isJsonObject(rawEvaluator)) {
|
|
731
|
-
logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
859
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
860
|
+
const outcome = asString5(evalcase.outcome);
|
|
861
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
862
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
863
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
864
|
+
logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
732
865
|
continue;
|
|
733
866
|
}
|
|
734
|
-
const
|
|
735
|
-
const
|
|
736
|
-
|
|
737
|
-
|
|
867
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
868
|
+
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
869
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
870
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
871
|
+
logWarning5(`No valid expected message found for eval case: ${id}`);
|
|
738
872
|
continue;
|
|
739
873
|
}
|
|
740
|
-
if (
|
|
741
|
-
|
|
742
|
-
if (!script) {
|
|
743
|
-
logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
744
|
-
continue;
|
|
745
|
-
}
|
|
746
|
-
const cwd = asString(rawEvaluator.cwd);
|
|
747
|
-
let resolvedCwd;
|
|
748
|
-
if (cwd) {
|
|
749
|
-
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
750
|
-
if (resolved.resolvedPath) {
|
|
751
|
-
resolvedCwd = path.resolve(resolved.resolvedPath);
|
|
752
|
-
} else {
|
|
753
|
-
logWarning(
|
|
754
|
-
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
755
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
756
|
-
);
|
|
757
|
-
}
|
|
758
|
-
} else {
|
|
759
|
-
resolvedCwd = searchRoots[0];
|
|
760
|
-
}
|
|
761
|
-
evaluators.push({
|
|
762
|
-
name,
|
|
763
|
-
type: "code",
|
|
764
|
-
script,
|
|
765
|
-
cwd,
|
|
766
|
-
resolvedCwd
|
|
767
|
-
});
|
|
768
|
-
continue;
|
|
874
|
+
if (expectedMessages.length > 1) {
|
|
875
|
+
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
769
876
|
}
|
|
770
|
-
const
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
877
|
+
const guidelinePaths = [];
|
|
878
|
+
const inputTextParts = [];
|
|
879
|
+
const inputSegments = await processMessages({
|
|
880
|
+
messages: inputMessages,
|
|
881
|
+
searchRoots,
|
|
882
|
+
repoRootPath,
|
|
883
|
+
guidelinePatterns,
|
|
884
|
+
guidelinePaths,
|
|
885
|
+
textParts: inputTextParts,
|
|
886
|
+
messageType: "input",
|
|
887
|
+
verbose
|
|
888
|
+
});
|
|
889
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
890
|
+
messages: expectedMessages,
|
|
891
|
+
searchRoots,
|
|
892
|
+
repoRootPath,
|
|
893
|
+
guidelinePatterns,
|
|
894
|
+
messageType: "output",
|
|
895
|
+
verbose
|
|
896
|
+
}) : [];
|
|
897
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
898
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
899
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
900
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
901
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
902
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
903
|
+
const userFilePaths = [];
|
|
904
|
+
for (const segment of inputSegments) {
|
|
905
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
906
|
+
userFilePaths.push(segment.resolvedPath);
|
|
781
907
|
}
|
|
782
908
|
}
|
|
783
|
-
const
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
909
|
+
const allFilePaths = [
|
|
910
|
+
...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
911
|
+
...userFilePaths
|
|
912
|
+
];
|
|
913
|
+
const testCase = {
|
|
914
|
+
id,
|
|
915
|
+
dataset: datasetName,
|
|
916
|
+
conversation_id: conversationId,
|
|
917
|
+
question,
|
|
918
|
+
input_messages: inputMessages,
|
|
919
|
+
input_segments: inputSegments,
|
|
920
|
+
output_segments: outputSegments,
|
|
921
|
+
reference_answer: referenceAnswer,
|
|
922
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
|
|
923
|
+
guideline_patterns: guidelinePatterns,
|
|
924
|
+
file_paths: allFilePaths,
|
|
925
|
+
code_snippets: codeSnippets,
|
|
926
|
+
expected_outcome: outcome,
|
|
927
|
+
evaluator: evalCaseEvaluatorKind,
|
|
928
|
+
evaluators
|
|
929
|
+
};
|
|
930
|
+
if (verbose) {
|
|
931
|
+
console.log(`
|
|
932
|
+
[Eval Case: ${id}]`);
|
|
933
|
+
if (testCase.guideline_paths.length > 0) {
|
|
934
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
935
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
936
|
+
console.log(` - ${guidelinePath}`);
|
|
937
|
+
}
|
|
938
|
+
} else {
|
|
939
|
+
console.log(" No guidelines found");
|
|
940
|
+
}
|
|
941
|
+
}
|
|
942
|
+
results.push(testCase);
|
|
790
943
|
}
|
|
791
|
-
return
|
|
944
|
+
return results;
|
|
792
945
|
}
|
|
793
|
-
function
|
|
794
|
-
|
|
795
|
-
return void 0;
|
|
796
|
-
}
|
|
797
|
-
if (isEvaluatorKind(candidate)) {
|
|
798
|
-
return candidate;
|
|
799
|
-
}
|
|
800
|
-
logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
801
|
-
return void 0;
|
|
946
|
+
function asString5(value) {
|
|
947
|
+
return typeof value === "string" ? value : void 0;
|
|
802
948
|
}
|
|
803
|
-
function
|
|
949
|
+
function logWarning5(message, details) {
|
|
804
950
|
if (details && details.length > 0) {
|
|
805
951
|
const detailBlock = details.join("\n");
|
|
806
|
-
console.warn(`${
|
|
807
|
-
${detailBlock}${
|
|
952
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
953
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
808
954
|
} else {
|
|
809
|
-
console.warn(`${
|
|
955
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
810
956
|
}
|
|
811
957
|
}
|
|
812
958
|
|
|
813
|
-
// src/evaluation/providers/
|
|
814
|
-
import {
|
|
959
|
+
// src/evaluation/providers/ai-sdk.ts
|
|
960
|
+
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
961
|
+
import { createAzure } from "@ai-sdk/azure";
|
|
962
|
+
import { createGoogleGenerativeAI } from "@ai-sdk/google";
|
|
963
|
+
import { generateText } from "ai";
|
|
815
964
|
var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
|
|
965
|
+
var AzureProvider = class {
|
|
966
|
+
constructor(targetName, config) {
|
|
967
|
+
this.config = config;
|
|
968
|
+
this.id = `azure:${targetName}`;
|
|
969
|
+
this.targetName = targetName;
|
|
970
|
+
this.defaults = {
|
|
971
|
+
temperature: config.temperature,
|
|
972
|
+
maxOutputTokens: config.maxOutputTokens
|
|
973
|
+
};
|
|
974
|
+
this.retryConfig = config.retry;
|
|
975
|
+
const azure = createAzure(buildAzureOptions(config));
|
|
976
|
+
this.model = azure(config.deploymentName);
|
|
977
|
+
}
|
|
978
|
+
id;
|
|
979
|
+
kind = "azure";
|
|
980
|
+
targetName;
|
|
981
|
+
model;
|
|
982
|
+
defaults;
|
|
983
|
+
retryConfig;
|
|
984
|
+
async invoke(request) {
|
|
985
|
+
return invokeModel({
|
|
986
|
+
model: this.model,
|
|
987
|
+
request,
|
|
988
|
+
defaults: this.defaults,
|
|
989
|
+
retryConfig: this.retryConfig
|
|
990
|
+
});
|
|
991
|
+
}
|
|
992
|
+
};
|
|
993
|
+
var AnthropicProvider = class {
|
|
994
|
+
constructor(targetName, config) {
|
|
995
|
+
this.config = config;
|
|
996
|
+
this.id = `anthropic:${targetName}`;
|
|
997
|
+
this.targetName = targetName;
|
|
998
|
+
this.defaults = {
|
|
999
|
+
temperature: config.temperature,
|
|
1000
|
+
maxOutputTokens: config.maxOutputTokens,
|
|
1001
|
+
thinkingBudget: config.thinkingBudget
|
|
1002
|
+
};
|
|
1003
|
+
this.retryConfig = config.retry;
|
|
1004
|
+
const anthropic = createAnthropic({
|
|
1005
|
+
apiKey: config.apiKey
|
|
1006
|
+
});
|
|
1007
|
+
this.model = anthropic(config.model);
|
|
1008
|
+
}
|
|
1009
|
+
id;
|
|
1010
|
+
kind = "anthropic";
|
|
1011
|
+
targetName;
|
|
1012
|
+
model;
|
|
1013
|
+
defaults;
|
|
1014
|
+
retryConfig;
|
|
1015
|
+
async invoke(request) {
|
|
1016
|
+
const providerOptions = buildAnthropicProviderOptions(this.defaults);
|
|
1017
|
+
return invokeModel({
|
|
1018
|
+
model: this.model,
|
|
1019
|
+
request,
|
|
1020
|
+
defaults: this.defaults,
|
|
1021
|
+
retryConfig: this.retryConfig,
|
|
1022
|
+
providerOptions
|
|
1023
|
+
});
|
|
1024
|
+
}
|
|
1025
|
+
};
|
|
1026
|
+
var GeminiProvider = class {
|
|
1027
|
+
constructor(targetName, config) {
|
|
1028
|
+
this.config = config;
|
|
1029
|
+
this.id = `gemini:${targetName}`;
|
|
1030
|
+
this.targetName = targetName;
|
|
1031
|
+
this.defaults = {
|
|
1032
|
+
temperature: config.temperature,
|
|
1033
|
+
maxOutputTokens: config.maxOutputTokens
|
|
1034
|
+
};
|
|
1035
|
+
this.retryConfig = config.retry;
|
|
1036
|
+
const google = createGoogleGenerativeAI({
|
|
1037
|
+
apiKey: config.apiKey
|
|
1038
|
+
});
|
|
1039
|
+
this.model = google(config.model);
|
|
1040
|
+
}
|
|
1041
|
+
id;
|
|
1042
|
+
kind = "gemini";
|
|
1043
|
+
targetName;
|
|
1044
|
+
model;
|
|
1045
|
+
defaults;
|
|
1046
|
+
retryConfig;
|
|
1047
|
+
async invoke(request) {
|
|
1048
|
+
return invokeModel({
|
|
1049
|
+
model: this.model,
|
|
1050
|
+
request,
|
|
1051
|
+
defaults: this.defaults,
|
|
1052
|
+
retryConfig: this.retryConfig
|
|
1053
|
+
});
|
|
1054
|
+
}
|
|
1055
|
+
};
|
|
1056
|
+
function buildAzureOptions(config) {
|
|
1057
|
+
const options = {
|
|
1058
|
+
apiKey: config.apiKey,
|
|
1059
|
+
apiVersion: config.version,
|
|
1060
|
+
useDeploymentBasedUrls: true
|
|
1061
|
+
};
|
|
1062
|
+
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
1063
|
+
if (baseURL) {
|
|
1064
|
+
options.baseURL = baseURL;
|
|
1065
|
+
} else {
|
|
1066
|
+
options.resourceName = config.resourceName;
|
|
1067
|
+
}
|
|
1068
|
+
return options;
|
|
1069
|
+
}
|
|
1070
|
+
function normalizeAzureBaseUrl(resourceName) {
|
|
1071
|
+
const trimmed = resourceName.trim();
|
|
1072
|
+
if (!/^https?:\/\//i.test(trimmed)) {
|
|
1073
|
+
return void 0;
|
|
1074
|
+
}
|
|
1075
|
+
const withoutSlash = trimmed.replace(/\/+$/, "");
|
|
1076
|
+
const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
|
|
1077
|
+
return normalized;
|
|
1078
|
+
}
|
|
1079
|
+
function buildAnthropicProviderOptions(defaults) {
|
|
1080
|
+
if (defaults.thinkingBudget === void 0) {
|
|
1081
|
+
return void 0;
|
|
1082
|
+
}
|
|
1083
|
+
return {
|
|
1084
|
+
anthropic: {
|
|
1085
|
+
thinking: {
|
|
1086
|
+
type: "enabled",
|
|
1087
|
+
budgetTokens: defaults.thinkingBudget
|
|
1088
|
+
}
|
|
1089
|
+
}
|
|
1090
|
+
};
|
|
1091
|
+
}
|
|
816
1092
|
function buildChatPrompt(request) {
|
|
817
|
-
|
|
818
|
-
|
|
1093
|
+
const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
|
|
1094
|
+
if (provided) {
|
|
1095
|
+
const hasSystemMessage = provided.some((message) => message.role === "system");
|
|
819
1096
|
if (hasSystemMessage) {
|
|
820
|
-
return
|
|
1097
|
+
return provided;
|
|
821
1098
|
}
|
|
822
|
-
const systemContent2 = resolveSystemContent(request);
|
|
823
|
-
return [{ role: "system", content: systemContent2 }, ...
|
|
1099
|
+
const systemContent2 = resolveSystemContent(request, false);
|
|
1100
|
+
return [{ role: "system", content: systemContent2 }, ...provided];
|
|
824
1101
|
}
|
|
825
|
-
const systemContent = resolveSystemContent(request);
|
|
1102
|
+
const systemContent = resolveSystemContent(request, true);
|
|
826
1103
|
const userContent = request.question.trim();
|
|
827
1104
|
const prompt = [
|
|
828
|
-
{
|
|
829
|
-
|
|
830
|
-
content: systemContent
|
|
831
|
-
},
|
|
832
|
-
{
|
|
833
|
-
role: "user",
|
|
834
|
-
content: userContent
|
|
835
|
-
}
|
|
1105
|
+
{ role: "system", content: systemContent },
|
|
1106
|
+
{ role: "user", content: userContent }
|
|
836
1107
|
];
|
|
837
1108
|
return prompt;
|
|
838
1109
|
}
|
|
839
|
-
function resolveSystemContent(request) {
|
|
1110
|
+
function resolveSystemContent(request, includeGuidelines) {
|
|
840
1111
|
const systemSegments = [];
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
systemSegments.push(metadataSystemPrompt.trim());
|
|
1112
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
1113
|
+
systemSegments.push(request.systemPrompt.trim());
|
|
844
1114
|
} else {
|
|
845
1115
|
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
846
1116
|
}
|
|
847
|
-
if (request.guidelines && request.guidelines.trim().length > 0) {
|
|
1117
|
+
if (includeGuidelines && request.guidelines && request.guidelines.trim().length > 0) {
|
|
848
1118
|
systemSegments.push(`[[ ## Guidelines ## ]]
|
|
849
1119
|
|
|
850
1120
|
${request.guidelines.trim()}`);
|
|
851
1121
|
}
|
|
852
1122
|
return systemSegments.join("\n\n");
|
|
853
1123
|
}
|
|
854
|
-
function
|
|
1124
|
+
function toModelMessages(chatPrompt) {
|
|
1125
|
+
return chatPrompt.map((message) => {
|
|
1126
|
+
if (message.role === "tool" || message.role === "function") {
|
|
1127
|
+
const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
|
|
1128
|
+
return {
|
|
1129
|
+
role: "assistant",
|
|
1130
|
+
content: `${prefix}${message.content}`
|
|
1131
|
+
};
|
|
1132
|
+
}
|
|
1133
|
+
if (message.role === "assistant" || message.role === "system" || message.role === "user") {
|
|
1134
|
+
return {
|
|
1135
|
+
role: message.role,
|
|
1136
|
+
content: message.content
|
|
1137
|
+
};
|
|
1138
|
+
}
|
|
1139
|
+
return {
|
|
1140
|
+
role: "user",
|
|
1141
|
+
content: message.content
|
|
1142
|
+
};
|
|
1143
|
+
});
|
|
1144
|
+
}
|
|
1145
|
+
function resolveModelSettings(request, defaults) {
|
|
855
1146
|
const temperature = request.temperature ?? defaults.temperature;
|
|
856
|
-
const
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
}
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
}
|
|
864
|
-
|
|
1147
|
+
const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
|
|
1148
|
+
return {
|
|
1149
|
+
temperature,
|
|
1150
|
+
maxOutputTokens
|
|
1151
|
+
};
|
|
1152
|
+
}
|
|
1153
|
+
async function invokeModel(options) {
|
|
1154
|
+
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
1155
|
+
const chatPrompt = buildChatPrompt(request);
|
|
1156
|
+
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
1157
|
+
const result = await withRetry(
|
|
1158
|
+
() => generateText({
|
|
1159
|
+
model,
|
|
1160
|
+
messages: toModelMessages(chatPrompt),
|
|
1161
|
+
temperature,
|
|
1162
|
+
maxOutputTokens,
|
|
1163
|
+
maxRetries: 0,
|
|
1164
|
+
abortSignal: request.signal,
|
|
1165
|
+
...providerOptions ? { providerOptions } : {}
|
|
1166
|
+
}),
|
|
1167
|
+
retryConfig,
|
|
1168
|
+
request.signal
|
|
1169
|
+
);
|
|
1170
|
+
return mapResponse(result);
|
|
865
1171
|
}
|
|
866
|
-
function mapResponse(
|
|
867
|
-
const primary = response.results[0];
|
|
868
|
-
const text = typeof primary?.content === "string" ? primary.content : "";
|
|
869
|
-
const reasoning = primary?.thought ?? primary?.thoughtBlock?.data;
|
|
870
|
-
const usage = toJsonObject(response.modelUsage);
|
|
1172
|
+
function mapResponse(result) {
|
|
871
1173
|
return {
|
|
872
|
-
text,
|
|
873
|
-
reasoning,
|
|
874
|
-
raw:
|
|
875
|
-
usage
|
|
1174
|
+
text: result.text ?? "",
|
|
1175
|
+
reasoning: result.reasoningText ?? void 0,
|
|
1176
|
+
raw: result,
|
|
1177
|
+
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
876
1178
|
};
|
|
877
1179
|
}
|
|
878
1180
|
function toJsonObject(value) {
|
|
@@ -885,34 +1187,59 @@ function toJsonObject(value) {
|
|
|
885
1187
|
return void 0;
|
|
886
1188
|
}
|
|
887
1189
|
}
|
|
888
|
-
function
|
|
889
|
-
if (typeof
|
|
890
|
-
|
|
1190
|
+
function extractStatus(error) {
|
|
1191
|
+
if (!error || typeof error !== "object") {
|
|
1192
|
+
return void 0;
|
|
891
1193
|
}
|
|
892
|
-
|
|
893
|
-
|
|
1194
|
+
const candidate = error;
|
|
1195
|
+
const directStatus = candidate.status ?? candidate.statusCode;
|
|
1196
|
+
if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
|
|
1197
|
+
return directStatus;
|
|
894
1198
|
}
|
|
895
|
-
|
|
1199
|
+
const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
|
|
1200
|
+
if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
|
|
1201
|
+
return responseStatus;
|
|
1202
|
+
}
|
|
1203
|
+
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
1204
|
+
if (message) {
|
|
1205
|
+
const match = message.match(/HTTP\s+(\d{3})/i);
|
|
1206
|
+
if (match) {
|
|
1207
|
+
const parsed = Number.parseInt(match[1], 10);
|
|
1208
|
+
if (Number.isFinite(parsed)) {
|
|
1209
|
+
return parsed;
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
return void 0;
|
|
896
1214
|
}
|
|
897
|
-
function
|
|
1215
|
+
function isNetworkError(error) {
|
|
898
1216
|
if (!error || typeof error !== "object") {
|
|
899
1217
|
return false;
|
|
900
1218
|
}
|
|
901
|
-
|
|
902
|
-
|
|
1219
|
+
const candidate = error;
|
|
1220
|
+
if (candidate.name === "AbortError") {
|
|
1221
|
+
return false;
|
|
903
1222
|
}
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
const status = Number.parseInt(match[1], 10);
|
|
908
|
-
return retryableStatusCodes.includes(status);
|
|
909
|
-
}
|
|
1223
|
+
const code = candidate.code;
|
|
1224
|
+
if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
|
|
1225
|
+
return true;
|
|
910
1226
|
}
|
|
911
|
-
|
|
1227
|
+
const message = typeof candidate.message === "string" ? candidate.message : void 0;
|
|
1228
|
+
if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
|
|
912
1229
|
return true;
|
|
913
1230
|
}
|
|
914
1231
|
return false;
|
|
915
1232
|
}
|
|
1233
|
+
function isRetryableError(error, retryableStatusCodes) {
|
|
1234
|
+
const status = extractStatus(error);
|
|
1235
|
+
if (status === 401 || status === 403) {
|
|
1236
|
+
return false;
|
|
1237
|
+
}
|
|
1238
|
+
if (typeof status === "number") {
|
|
1239
|
+
return retryableStatusCodes.includes(status);
|
|
1240
|
+
}
|
|
1241
|
+
return isNetworkError(error);
|
|
1242
|
+
}
|
|
916
1243
|
function calculateRetryDelay(attempt, config) {
|
|
917
1244
|
const delay = Math.min(
|
|
918
1245
|
config.maxDelayMs,
|
|
@@ -948,152 +1275,16 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
948
1275
|
}
|
|
949
1276
|
const delay = calculateRetryDelay(attempt, config);
|
|
950
1277
|
await sleep(delay);
|
|
951
|
-
if (signal?.aborted) {
|
|
952
|
-
throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
|
|
953
|
-
}
|
|
954
1278
|
}
|
|
955
1279
|
}
|
|
956
1280
|
throw lastError;
|
|
957
1281
|
}
|
|
958
|
-
var AzureProvider = class {
|
|
959
|
-
constructor(targetName, config) {
|
|
960
|
-
this.config = config;
|
|
961
|
-
this.id = `azure:${targetName}`;
|
|
962
|
-
this.targetName = targetName;
|
|
963
|
-
this.defaults = {
|
|
964
|
-
temperature: config.temperature,
|
|
965
|
-
maxOutputTokens: config.maxOutputTokens
|
|
966
|
-
};
|
|
967
|
-
this.retryConfig = config.retry;
|
|
968
|
-
this.ai = AxAI.create({
|
|
969
|
-
name: "azure-openai",
|
|
970
|
-
apiKey: config.apiKey,
|
|
971
|
-
resourceName: config.resourceName,
|
|
972
|
-
deploymentName: config.deploymentName,
|
|
973
|
-
version: config.version,
|
|
974
|
-
config: {
|
|
975
|
-
stream: false
|
|
976
|
-
}
|
|
977
|
-
});
|
|
978
|
-
}
|
|
979
|
-
id;
|
|
980
|
-
kind = "azure";
|
|
981
|
-
targetName;
|
|
982
|
-
ai;
|
|
983
|
-
defaults;
|
|
984
|
-
retryConfig;
|
|
985
|
-
async invoke(request) {
|
|
986
|
-
const chatPrompt = buildChatPrompt(request);
|
|
987
|
-
const modelConfig = extractModelConfig(request, this.defaults);
|
|
988
|
-
const response = await withRetry(
|
|
989
|
-
async () => await this.ai.chat(
|
|
990
|
-
{
|
|
991
|
-
chatPrompt,
|
|
992
|
-
model: this.config.deploymentName,
|
|
993
|
-
...modelConfig ? { modelConfig } : {}
|
|
994
|
-
},
|
|
995
|
-
request.signal ? { abortSignal: request.signal } : void 0
|
|
996
|
-
),
|
|
997
|
-
this.retryConfig,
|
|
998
|
-
request.signal
|
|
999
|
-
);
|
|
1000
|
-
return mapResponse(ensureChatResponse(response));
|
|
1001
|
-
}
|
|
1002
|
-
getAxAI() {
|
|
1003
|
-
return this.ai;
|
|
1004
|
-
}
|
|
1005
|
-
};
|
|
1006
|
-
var AnthropicProvider = class {
|
|
1007
|
-
constructor(targetName, config) {
|
|
1008
|
-
this.config = config;
|
|
1009
|
-
this.id = `anthropic:${targetName}`;
|
|
1010
|
-
this.targetName = targetName;
|
|
1011
|
-
this.defaults = {
|
|
1012
|
-
temperature: config.temperature,
|
|
1013
|
-
maxOutputTokens: config.maxOutputTokens,
|
|
1014
|
-
thinkingBudget: config.thinkingBudget
|
|
1015
|
-
};
|
|
1016
|
-
this.retryConfig = config.retry;
|
|
1017
|
-
this.ai = AxAI.create({
|
|
1018
|
-
name: "anthropic",
|
|
1019
|
-
apiKey: config.apiKey
|
|
1020
|
-
});
|
|
1021
|
-
}
|
|
1022
|
-
id;
|
|
1023
|
-
kind = "anthropic";
|
|
1024
|
-
targetName;
|
|
1025
|
-
ai;
|
|
1026
|
-
defaults;
|
|
1027
|
-
retryConfig;
|
|
1028
|
-
async invoke(request) {
|
|
1029
|
-
const chatPrompt = buildChatPrompt(request);
|
|
1030
|
-
const modelConfig = extractModelConfig(request, this.defaults);
|
|
1031
|
-
const response = await withRetry(
|
|
1032
|
-
async () => await this.ai.chat(
|
|
1033
|
-
{
|
|
1034
|
-
chatPrompt,
|
|
1035
|
-
model: this.config.model,
|
|
1036
|
-
...modelConfig ? { modelConfig } : {}
|
|
1037
|
-
},
|
|
1038
|
-
request.signal ? { abortSignal: request.signal } : void 0
|
|
1039
|
-
),
|
|
1040
|
-
this.retryConfig,
|
|
1041
|
-
request.signal
|
|
1042
|
-
);
|
|
1043
|
-
return mapResponse(ensureChatResponse(response));
|
|
1044
|
-
}
|
|
1045
|
-
getAxAI() {
|
|
1046
|
-
return this.ai;
|
|
1047
|
-
}
|
|
1048
|
-
};
|
|
1049
|
-
var GeminiProvider = class {
|
|
1050
|
-
constructor(targetName, config) {
|
|
1051
|
-
this.config = config;
|
|
1052
|
-
this.id = `gemini:${targetName}`;
|
|
1053
|
-
this.targetName = targetName;
|
|
1054
|
-
this.defaults = {
|
|
1055
|
-
temperature: config.temperature,
|
|
1056
|
-
maxOutputTokens: config.maxOutputTokens
|
|
1057
|
-
};
|
|
1058
|
-
this.retryConfig = config.retry;
|
|
1059
|
-
this.ai = AxAI.create({
|
|
1060
|
-
name: "google-gemini",
|
|
1061
|
-
apiKey: config.apiKey
|
|
1062
|
-
});
|
|
1063
|
-
}
|
|
1064
|
-
id;
|
|
1065
|
-
kind = "gemini";
|
|
1066
|
-
targetName;
|
|
1067
|
-
ai;
|
|
1068
|
-
defaults;
|
|
1069
|
-
retryConfig;
|
|
1070
|
-
async invoke(request) {
|
|
1071
|
-
const chatPrompt = buildChatPrompt(request);
|
|
1072
|
-
const modelConfig = extractModelConfig(request, this.defaults);
|
|
1073
|
-
const response = await withRetry(
|
|
1074
|
-
async () => await this.ai.chat(
|
|
1075
|
-
{
|
|
1076
|
-
chatPrompt,
|
|
1077
|
-
model: this.config.model,
|
|
1078
|
-
...modelConfig ? { modelConfig } : {}
|
|
1079
|
-
},
|
|
1080
|
-
request.signal ? { abortSignal: request.signal } : void 0
|
|
1081
|
-
),
|
|
1082
|
-
this.retryConfig,
|
|
1083
|
-
request.signal
|
|
1084
|
-
);
|
|
1085
|
-
return mapResponse(ensureChatResponse(response));
|
|
1086
|
-
}
|
|
1087
|
-
getAxAI() {
|
|
1088
|
-
return this.ai;
|
|
1089
|
-
}
|
|
1090
|
-
};
|
|
1091
1282
|
|
|
1092
1283
|
// src/evaluation/providers/cli.ts
|
|
1093
1284
|
import { exec as execWithCallback } from "node:child_process";
|
|
1094
1285
|
import fs from "node:fs/promises";
|
|
1095
1286
|
import os from "node:os";
|
|
1096
|
-
import
|
|
1287
|
+
import path7 from "node:path";
|
|
1097
1288
|
import { promisify } from "node:util";
|
|
1098
1289
|
var execAsync = promisify(execWithCallback);
|
|
1099
1290
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
@@ -1135,12 +1326,14 @@ var CliProvider = class {
|
|
|
1135
1326
|
supportsBatch = false;
|
|
1136
1327
|
config;
|
|
1137
1328
|
runCommand;
|
|
1329
|
+
verbose;
|
|
1138
1330
|
healthcheckPromise;
|
|
1139
1331
|
constructor(targetName, config, runner = defaultCommandRunner) {
|
|
1140
1332
|
this.targetName = targetName;
|
|
1141
1333
|
this.id = `cli:${targetName}`;
|
|
1142
1334
|
this.config = config;
|
|
1143
1335
|
this.runCommand = runner;
|
|
1336
|
+
this.verbose = config.verbose ?? false;
|
|
1144
1337
|
}
|
|
1145
1338
|
async invoke(request) {
|
|
1146
1339
|
if (request.signal?.aborted) {
|
|
@@ -1241,6 +1434,11 @@ var CliProvider = class {
|
|
|
1241
1434
|
generateOutputFilePath("healthcheck")
|
|
1242
1435
|
)
|
|
1243
1436
|
);
|
|
1437
|
+
if (this.verbose) {
|
|
1438
|
+
console.log(
|
|
1439
|
+
`[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
1440
|
+
);
|
|
1441
|
+
}
|
|
1244
1442
|
const result = await this.runCommand(renderedCommand, {
|
|
1245
1443
|
cwd: healthcheck.cwd ?? this.config.cwd,
|
|
1246
1444
|
env: process.env,
|
|
@@ -1272,7 +1470,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
1272
1470
|
}
|
|
1273
1471
|
const unique = /* @__PURE__ */ new Map();
|
|
1274
1472
|
for (const inputFile of inputFiles) {
|
|
1275
|
-
const absolutePath =
|
|
1473
|
+
const absolutePath = path7.resolve(inputFile);
|
|
1276
1474
|
if (!unique.has(absolutePath)) {
|
|
1277
1475
|
unique.set(absolutePath, absolutePath);
|
|
1278
1476
|
}
|
|
@@ -1286,7 +1484,7 @@ function formatFileList(files, template) {
|
|
|
1286
1484
|
const formatter = template ?? "{path}";
|
|
1287
1485
|
return files.map((filePath) => {
|
|
1288
1486
|
const escapedPath = shellEscape(filePath);
|
|
1289
|
-
const escapedName = shellEscape(
|
|
1487
|
+
const escapedName = shellEscape(path7.basename(filePath));
|
|
1290
1488
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
1291
1489
|
}).join(" ");
|
|
1292
1490
|
}
|
|
@@ -1310,7 +1508,7 @@ function generateOutputFilePath(evalCaseId) {
|
|
|
1310
1508
|
const safeEvalId = evalCaseId || "unknown";
|
|
1311
1509
|
const timestamp = Date.now();
|
|
1312
1510
|
const random = Math.random().toString(36).substring(2, 9);
|
|
1313
|
-
return
|
|
1511
|
+
return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
1314
1512
|
}
|
|
1315
1513
|
function formatTimeoutSuffix(timeoutMs) {
|
|
1316
1514
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -1326,7 +1524,7 @@ import { randomUUID } from "node:crypto";
|
|
|
1326
1524
|
import { constants as constants2, createWriteStream } from "node:fs";
|
|
1327
1525
|
import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
|
|
1328
1526
|
import { tmpdir } from "node:os";
|
|
1329
|
-
import
|
|
1527
|
+
import path9 from "node:path";
|
|
1330
1528
|
import { promisify as promisify2 } from "node:util";
|
|
1331
1529
|
|
|
1332
1530
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -1383,7 +1581,7 @@ function subscribeToCodexLogEntries(listener) {
|
|
|
1383
1581
|
}
|
|
1384
1582
|
|
|
1385
1583
|
// src/evaluation/providers/preread.ts
|
|
1386
|
-
import
|
|
1584
|
+
import path8 from "node:path";
|
|
1387
1585
|
function buildPromptDocument(request, inputFiles, options) {
|
|
1388
1586
|
const parts = [];
|
|
1389
1587
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -1408,7 +1606,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
1408
1606
|
}
|
|
1409
1607
|
const deduped = /* @__PURE__ */ new Map();
|
|
1410
1608
|
for (const inputFile of inputFiles) {
|
|
1411
|
-
const absolutePath =
|
|
1609
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1412
1610
|
if (!deduped.has(absolutePath)) {
|
|
1413
1611
|
deduped.set(absolutePath, absolutePath);
|
|
1414
1612
|
}
|
|
@@ -1421,14 +1619,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
1421
1619
|
}
|
|
1422
1620
|
const unique = /* @__PURE__ */ new Map();
|
|
1423
1621
|
for (const inputFile of inputFiles) {
|
|
1424
|
-
const absolutePath =
|
|
1622
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1425
1623
|
if (overrides?.has(absolutePath)) {
|
|
1426
1624
|
if (!unique.has(absolutePath)) {
|
|
1427
1625
|
unique.set(absolutePath, absolutePath);
|
|
1428
1626
|
}
|
|
1429
1627
|
continue;
|
|
1430
1628
|
}
|
|
1431
|
-
const normalized = absolutePath.split(
|
|
1629
|
+
const normalized = absolutePath.split(path8.sep).join("/");
|
|
1432
1630
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1433
1631
|
if (!unique.has(absolutePath)) {
|
|
1434
1632
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1443,7 +1641,7 @@ function collectInputFiles(inputFiles) {
|
|
|
1443
1641
|
}
|
|
1444
1642
|
const unique = /* @__PURE__ */ new Map();
|
|
1445
1643
|
for (const inputFile of inputFiles) {
|
|
1446
|
-
const absolutePath =
|
|
1644
|
+
const absolutePath = path8.resolve(inputFile);
|
|
1447
1645
|
if (!unique.has(absolutePath)) {
|
|
1448
1646
|
unique.set(absolutePath, absolutePath);
|
|
1449
1647
|
}
|
|
@@ -1455,7 +1653,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
1455
1653
|
return "";
|
|
1456
1654
|
}
|
|
1457
1655
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1458
|
-
const fileName =
|
|
1656
|
+
const fileName = path8.basename(absolutePath);
|
|
1459
1657
|
const fileUri = pathToFileUri(absolutePath);
|
|
1460
1658
|
return `* [${fileName}](${fileUri})`;
|
|
1461
1659
|
});
|
|
@@ -1475,7 +1673,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
1475
1673
|
return sections.join("\n");
|
|
1476
1674
|
}
|
|
1477
1675
|
function pathToFileUri(filePath) {
|
|
1478
|
-
const absolutePath =
|
|
1676
|
+
const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
|
|
1479
1677
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1480
1678
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1481
1679
|
return `file:///${normalizedPath}`;
|
|
@@ -1513,7 +1711,7 @@ var CodexProvider = class {
|
|
|
1513
1711
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1514
1712
|
try {
|
|
1515
1713
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1516
|
-
const promptFile =
|
|
1714
|
+
const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
|
|
1517
1715
|
await writeFile(promptFile, promptContent, "utf8");
|
|
1518
1716
|
const args = this.buildCodexArgs();
|
|
1519
1717
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -1563,7 +1761,7 @@ var CodexProvider = class {
|
|
|
1563
1761
|
if (!this.config.cwd) {
|
|
1564
1762
|
return workspaceRoot;
|
|
1565
1763
|
}
|
|
1566
|
-
return
|
|
1764
|
+
return path9.resolve(this.config.cwd);
|
|
1567
1765
|
}
|
|
1568
1766
|
buildCodexArgs() {
|
|
1569
1767
|
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
@@ -1597,7 +1795,7 @@ var CodexProvider = class {
|
|
|
1597
1795
|
}
|
|
1598
1796
|
}
|
|
1599
1797
|
async createWorkspace() {
|
|
1600
|
-
return await mkdtemp(
|
|
1798
|
+
return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
|
|
1601
1799
|
}
|
|
1602
1800
|
async cleanupWorkspace(workspaceRoot) {
|
|
1603
1801
|
try {
|
|
@@ -1611,9 +1809,9 @@ var CodexProvider = class {
|
|
|
1611
1809
|
return void 0;
|
|
1612
1810
|
}
|
|
1613
1811
|
if (this.config.logDir) {
|
|
1614
|
-
return
|
|
1812
|
+
return path9.resolve(this.config.logDir);
|
|
1615
1813
|
}
|
|
1616
|
-
return
|
|
1814
|
+
return path9.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1617
1815
|
}
|
|
1618
1816
|
async createStreamLogger(request) {
|
|
1619
1817
|
const logDir = this.resolveLogDirectory();
|
|
@@ -1627,7 +1825,7 @@ var CodexProvider = class {
|
|
|
1627
1825
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1628
1826
|
return void 0;
|
|
1629
1827
|
}
|
|
1630
|
-
const filePath =
|
|
1828
|
+
const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
|
|
1631
1829
|
try {
|
|
1632
1830
|
const logger = await CodexStreamLogger.create({
|
|
1633
1831
|
filePath,
|
|
@@ -1842,7 +2040,7 @@ function tryParseJsonValue(rawLine) {
|
|
|
1842
2040
|
async function locateExecutable(candidate) {
|
|
1843
2041
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1844
2042
|
if (includesPathSeparator) {
|
|
1845
|
-
const resolved =
|
|
2043
|
+
const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
|
|
1846
2044
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
1847
2045
|
await access2(executablePath, constants2.F_OK);
|
|
1848
2046
|
return executablePath;
|
|
@@ -2189,7 +2387,7 @@ var MockProvider = class {
|
|
|
2189
2387
|
};
|
|
2190
2388
|
|
|
2191
2389
|
// src/evaluation/providers/vscode.ts
|
|
2192
|
-
import
|
|
2390
|
+
import path10 from "node:path";
|
|
2193
2391
|
import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
|
|
2194
2392
|
var VSCodeProvider = class {
|
|
2195
2393
|
id;
|
|
@@ -2302,6 +2500,9 @@ var VSCodeProvider = class {
|
|
|
2302
2500
|
};
|
|
2303
2501
|
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
2304
2502
|
const parts = [];
|
|
2503
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
2504
|
+
parts.push(request.systemPrompt.trim());
|
|
2505
|
+
}
|
|
2305
2506
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2306
2507
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2307
2508
|
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
@@ -2319,7 +2520,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
2319
2520
|
return "";
|
|
2320
2521
|
}
|
|
2321
2522
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2322
|
-
const fileName =
|
|
2523
|
+
const fileName = path10.basename(absolutePath);
|
|
2323
2524
|
const fileUri = pathToFileUri2(absolutePath);
|
|
2324
2525
|
return `* [${fileName}](${fileUri})`;
|
|
2325
2526
|
});
|
|
@@ -2344,8 +2545,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
2344
2545
|
}
|
|
2345
2546
|
const unique = /* @__PURE__ */ new Map();
|
|
2346
2547
|
for (const attachment of attachments) {
|
|
2347
|
-
const absolutePath =
|
|
2348
|
-
const normalized = absolutePath.split(
|
|
2548
|
+
const absolutePath = path10.resolve(attachment);
|
|
2549
|
+
const normalized = absolutePath.split(path10.sep).join("/");
|
|
2349
2550
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2350
2551
|
if (!unique.has(absolutePath)) {
|
|
2351
2552
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2360,7 +2561,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
2360
2561
|
}
|
|
2361
2562
|
const unique = /* @__PURE__ */ new Map();
|
|
2362
2563
|
for (const attachment of attachments) {
|
|
2363
|
-
const absolutePath =
|
|
2564
|
+
const absolutePath = path10.resolve(attachment);
|
|
2364
2565
|
if (!unique.has(absolutePath)) {
|
|
2365
2566
|
unique.set(absolutePath, absolutePath);
|
|
2366
2567
|
}
|
|
@@ -2368,7 +2569,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
2368
2569
|
return Array.from(unique.values());
|
|
2369
2570
|
}
|
|
2370
2571
|
function pathToFileUri2(filePath) {
|
|
2371
|
-
const absolutePath =
|
|
2572
|
+
const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
|
|
2372
2573
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2373
2574
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2374
2575
|
return `file:///${normalizedPath}`;
|
|
@@ -2381,7 +2582,7 @@ function normalizeAttachments(attachments) {
|
|
|
2381
2582
|
}
|
|
2382
2583
|
const deduped = /* @__PURE__ */ new Set();
|
|
2383
2584
|
for (const attachment of attachments) {
|
|
2384
|
-
deduped.add(
|
|
2585
|
+
deduped.add(path10.resolve(attachment));
|
|
2385
2586
|
}
|
|
2386
2587
|
return Array.from(deduped);
|
|
2387
2588
|
}
|
|
@@ -2390,7 +2591,7 @@ function mergeAttachments(all) {
|
|
|
2390
2591
|
for (const list of all) {
|
|
2391
2592
|
if (!list) continue;
|
|
2392
2593
|
for (const inputFile of list) {
|
|
2393
|
-
deduped.add(
|
|
2594
|
+
deduped.add(path10.resolve(inputFile));
|
|
2394
2595
|
}
|
|
2395
2596
|
}
|
|
2396
2597
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -2436,9 +2637,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
2436
2637
|
|
|
2437
2638
|
// src/evaluation/providers/targets-file.ts
|
|
2438
2639
|
import { constants as constants3 } from "node:fs";
|
|
2439
|
-
import { access as access3, readFile as
|
|
2440
|
-
import
|
|
2441
|
-
import { parse as
|
|
2640
|
+
import { access as access3, readFile as readFile5 } from "node:fs/promises";
|
|
2641
|
+
import path11 from "node:path";
|
|
2642
|
+
import { parse as parse3 } from "yaml";
|
|
2442
2643
|
function isRecord(value) {
|
|
2443
2644
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
2444
2645
|
}
|
|
@@ -2493,12 +2694,12 @@ async function fileExists3(filePath) {
|
|
|
2493
2694
|
}
|
|
2494
2695
|
}
|
|
2495
2696
|
async function readTargetDefinitions(filePath) {
|
|
2496
|
-
const absolutePath =
|
|
2697
|
+
const absolutePath = path11.resolve(filePath);
|
|
2497
2698
|
if (!await fileExists3(absolutePath)) {
|
|
2498
2699
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
2499
2700
|
}
|
|
2500
|
-
const raw = await
|
|
2501
|
-
const parsed =
|
|
2701
|
+
const raw = await readFile5(absolutePath, "utf8");
|
|
2702
|
+
const parsed = parse3(raw);
|
|
2502
2703
|
if (!isRecord(parsed)) {
|
|
2503
2704
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
2504
2705
|
}
|
|
@@ -2541,18 +2742,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
2541
2742
|
}
|
|
2542
2743
|
|
|
2543
2744
|
// src/evaluation/evaluators.ts
|
|
2544
|
-
|
|
2745
|
+
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
2746
|
+
|
|
2747
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
2748
|
+
|
|
2749
|
+
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
2750
|
+
|
|
2751
|
+
[[ ## expected_outcome ## ]]
|
|
2752
|
+
{{expected_outcome}}
|
|
2753
|
+
|
|
2754
|
+
[[ ## question ## ]]
|
|
2755
|
+
{{question}}
|
|
2756
|
+
|
|
2757
|
+
[[ ## reference_answer ## ]]
|
|
2758
|
+
{{reference_answer}}
|
|
2759
|
+
|
|
2760
|
+
[[ ## candidate_answer ## ]]
|
|
2761
|
+
{{candidate_answer}}`;
|
|
2545
2762
|
var LlmJudgeEvaluator = class {
|
|
2546
2763
|
kind = "llm_judge";
|
|
2547
2764
|
resolveJudgeProvider;
|
|
2548
2765
|
maxOutputTokens;
|
|
2549
2766
|
temperature;
|
|
2550
|
-
|
|
2767
|
+
evaluatorTemplate;
|
|
2551
2768
|
constructor(options) {
|
|
2552
2769
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
2553
2770
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
2554
2771
|
this.temperature = options.temperature;
|
|
2555
|
-
this.
|
|
2772
|
+
this.evaluatorTemplate = options.evaluatorTemplate;
|
|
2556
2773
|
}
|
|
2557
2774
|
async evaluate(context) {
|
|
2558
2775
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
@@ -2562,26 +2779,21 @@ var LlmJudgeEvaluator = class {
|
|
|
2562
2779
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
2563
2780
|
}
|
|
2564
2781
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
2565
|
-
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
2566
2782
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
2567
|
-
|
|
2568
|
-
|
|
2569
|
-
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
|
|
2573
|
-
|
|
2574
|
-
|
|
2575
|
-
|
|
2576
|
-
|
|
2577
|
-
|
|
2578
|
-
prompt = substituteVariables(systemPrompt, variables);
|
|
2579
|
-
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
2580
|
-
}
|
|
2581
|
-
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
2783
|
+
const variables = {
|
|
2784
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
2785
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
2786
|
+
candidate_answer: context.candidate.trim(),
|
|
2787
|
+
reference_answer: (context.evalCase.reference_answer ?? "").trim(),
|
|
2788
|
+
expected_outcome: context.evalCase.expected_outcome.trim(),
|
|
2789
|
+
question: formattedQuestion.trim()
|
|
2790
|
+
};
|
|
2791
|
+
const systemPrompt = buildOutputSchema();
|
|
2792
|
+
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
2793
|
+
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
2582
2794
|
const response = await judgeProvider.invoke({
|
|
2583
|
-
question:
|
|
2584
|
-
|
|
2795
|
+
question: userPrompt,
|
|
2796
|
+
systemPrompt,
|
|
2585
2797
|
evalCaseId: context.evalCase.id,
|
|
2586
2798
|
attempt: context.attempt,
|
|
2587
2799
|
maxOutputTokens: this.maxOutputTokens,
|
|
@@ -2594,11 +2806,9 @@ var LlmJudgeEvaluator = class {
|
|
|
2594
2806
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
2595
2807
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
2596
2808
|
const evaluatorRawRequest = {
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
target: context.target.name,
|
|
2601
|
-
...systemPrompt !== void 0 && { systemPrompt }
|
|
2809
|
+
userPrompt,
|
|
2810
|
+
systemPrompt,
|
|
2811
|
+
target: judgeProvider.targetName
|
|
2602
2812
|
};
|
|
2603
2813
|
return {
|
|
2604
2814
|
score,
|
|
@@ -2610,20 +2820,8 @@ var LlmJudgeEvaluator = class {
|
|
|
2610
2820
|
};
|
|
2611
2821
|
}
|
|
2612
2822
|
};
|
|
2613
|
-
function
|
|
2614
|
-
|
|
2615
|
-
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
2616
|
-
""
|
|
2617
|
-
];
|
|
2618
|
-
if (hasReferenceAnswer) {
|
|
2619
|
-
basePrompt.push(
|
|
2620
|
-
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
2621
|
-
""
|
|
2622
|
-
);
|
|
2623
|
-
}
|
|
2624
|
-
basePrompt.push(
|
|
2625
|
-
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
2626
|
-
"",
|
|
2823
|
+
function buildOutputSchema() {
|
|
2824
|
+
return [
|
|
2627
2825
|
"You must respond with a single JSON object matching this schema:",
|
|
2628
2826
|
"",
|
|
2629
2827
|
"{",
|
|
@@ -2632,30 +2830,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
|
|
|
2632
2830
|
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
2633
2831
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
2634
2832
|
"}"
|
|
2635
|
-
);
|
|
2636
|
-
return basePrompt.join("\n");
|
|
2637
|
-
}
|
|
2638
|
-
function buildQualityPrompt(evalCase, candidate, question) {
|
|
2639
|
-
const parts = [
|
|
2640
|
-
"[[ ## expected_outcome ## ]]",
|
|
2641
|
-
evalCase.expected_outcome.trim(),
|
|
2642
|
-
"",
|
|
2643
|
-
"[[ ## question ## ]]",
|
|
2644
|
-
question.trim(),
|
|
2645
|
-
""
|
|
2646
|
-
];
|
|
2647
|
-
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
2648
|
-
parts.push(
|
|
2649
|
-
"[[ ## reference_answer ## ]]",
|
|
2650
|
-
evalCase.reference_answer.trim(),
|
|
2651
|
-
""
|
|
2652
|
-
);
|
|
2653
|
-
}
|
|
2654
|
-
parts.push(
|
|
2655
|
-
"[[ ## candidate_answer ## ]]",
|
|
2656
|
-
candidate.trim()
|
|
2657
|
-
);
|
|
2658
|
-
return parts.join("\n");
|
|
2833
|
+
].join("\n");
|
|
2659
2834
|
}
|
|
2660
2835
|
function clampScore(value) {
|
|
2661
2836
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -2737,9 +2912,6 @@ function extractJsonBlob(text) {
|
|
|
2737
2912
|
function isNonEmptyString(value) {
|
|
2738
2913
|
return typeof value === "string" && value.trim().length > 0;
|
|
2739
2914
|
}
|
|
2740
|
-
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
2741
|
-
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
2742
|
-
}
|
|
2743
2915
|
var CodeEvaluator = class {
|
|
2744
2916
|
kind = "code";
|
|
2745
2917
|
script;
|
|
@@ -2845,19 +3017,16 @@ function parseJsonSafe(payload) {
|
|
|
2845
3017
|
return void 0;
|
|
2846
3018
|
}
|
|
2847
3019
|
}
|
|
2848
|
-
function hasTemplateVariables(text) {
|
|
2849
|
-
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
2850
|
-
}
|
|
2851
3020
|
function substituteVariables(template, variables) {
|
|
2852
|
-
return template.replace(
|
|
3021
|
+
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
2853
3022
|
return variables[varName] ?? match;
|
|
2854
3023
|
});
|
|
2855
3024
|
}
|
|
2856
3025
|
|
|
2857
3026
|
// src/evaluation/orchestrator.ts
|
|
2858
|
-
import { createHash, randomUUID as
|
|
3027
|
+
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
2859
3028
|
import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
|
|
2860
|
-
import
|
|
3029
|
+
import path12 from "node:path";
|
|
2861
3030
|
|
|
2862
3031
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
2863
3032
|
var Node = class {
|
|
@@ -3420,6 +3589,7 @@ async function evaluateCandidate(options) {
|
|
|
3420
3589
|
}
|
|
3421
3590
|
}
|
|
3422
3591
|
return {
|
|
3592
|
+
timestamp: completedAt.toISOString(),
|
|
3423
3593
|
eval_id: evalCase.id,
|
|
3424
3594
|
dataset: evalCase.dataset,
|
|
3425
3595
|
conversation_id: evalCase.conversation_id,
|
|
@@ -3427,14 +3597,12 @@ async function evaluateCandidate(options) {
|
|
|
3427
3597
|
hits: score.hits,
|
|
3428
3598
|
misses: score.misses,
|
|
3429
3599
|
candidate_answer: candidate,
|
|
3430
|
-
expected_aspect_count: score.expectedAspectCount,
|
|
3431
3600
|
target: target.name,
|
|
3432
|
-
timestamp: completedAt.toISOString(),
|
|
3433
3601
|
reasoning: score.reasoning,
|
|
3434
3602
|
raw_aspects: score.rawAspects,
|
|
3435
3603
|
agent_provider_request: agentProviderRequest,
|
|
3436
3604
|
lm_provider_request: lmProviderRequest,
|
|
3437
|
-
|
|
3605
|
+
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
3438
3606
|
evaluator_results: evaluatorResults
|
|
3439
3607
|
};
|
|
3440
3608
|
}
|
|
@@ -3511,7 +3679,7 @@ async function runEvaluatorList(options) {
|
|
|
3511
3679
|
hits: score2.hits,
|
|
3512
3680
|
misses: score2.misses,
|
|
3513
3681
|
reasoning: score2.reasoning,
|
|
3514
|
-
|
|
3682
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3515
3683
|
});
|
|
3516
3684
|
continue;
|
|
3517
3685
|
}
|
|
@@ -3538,7 +3706,7 @@ async function runEvaluatorList(options) {
|
|
|
3538
3706
|
hits: score2.hits,
|
|
3539
3707
|
misses: score2.misses,
|
|
3540
3708
|
reasoning: score2.reasoning,
|
|
3541
|
-
|
|
3709
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
3542
3710
|
});
|
|
3543
3711
|
continue;
|
|
3544
3712
|
}
|
|
@@ -3591,7 +3759,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
3591
3759
|
promptInputs,
|
|
3592
3760
|
now,
|
|
3593
3761
|
judgeProvider,
|
|
3594
|
-
|
|
3762
|
+
evaluatorTemplateOverride: customPrompt,
|
|
3595
3763
|
evaluator: config
|
|
3596
3764
|
});
|
|
3597
3765
|
}
|
|
@@ -3632,8 +3800,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
3632
3800
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
3633
3801
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3634
3802
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
3635
|
-
const filePath =
|
|
3636
|
-
await mkdir2(
|
|
3803
|
+
const filePath = path12.resolve(directory, filename);
|
|
3804
|
+
await mkdir2(path12.dirname(filePath), { recursive: true });
|
|
3637
3805
|
const payload = {
|
|
3638
3806
|
eval_id: evalCase.id,
|
|
3639
3807
|
question: promptInputs.question,
|
|
@@ -3647,7 +3815,7 @@ function sanitizeFilename(value) {
|
|
|
3647
3815
|
return "prompt";
|
|
3648
3816
|
}
|
|
3649
3817
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
3650
|
-
return sanitized.length > 0 ? sanitized :
|
|
3818
|
+
return sanitized.length > 0 ? sanitized : randomUUID2();
|
|
3651
3819
|
}
|
|
3652
3820
|
async function invokeProvider(provider, options) {
|
|
3653
3821
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -3703,6 +3871,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3703
3871
|
}
|
|
3704
3872
|
}
|
|
3705
3873
|
return {
|
|
3874
|
+
timestamp: timestamp.toISOString(),
|
|
3706
3875
|
eval_id: evalCase.id,
|
|
3707
3876
|
dataset: evalCase.dataset,
|
|
3708
3877
|
conversation_id: evalCase.conversation_id,
|
|
@@ -3710,9 +3879,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
3710
3879
|
hits: [],
|
|
3711
3880
|
misses: [`Error: ${message}`],
|
|
3712
3881
|
candidate_answer: `Error occurred: ${message}`,
|
|
3713
|
-
expected_aspect_count: 0,
|
|
3714
3882
|
target: targetName,
|
|
3715
|
-
timestamp: timestamp.toISOString(),
|
|
3716
3883
|
raw_aspects: [],
|
|
3717
3884
|
agent_provider_request: agentProviderRequest,
|
|
3718
3885
|
lm_provider_request: lmProviderRequest,
|