@agentv/core 0.10.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-YQBJAT5I.js → chunk-U3GEJ3K7.js} +1 -1
- package/dist/{chunk-YQBJAT5I.js.map → chunk-U3GEJ3K7.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +691 -562
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +29 -26
- package/dist/index.d.ts +29 -26
- package/dist/index.js +638 -507
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -33,15 +33,15 @@ __export(index_exports, {
|
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
35
35
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
36
|
-
buildDirectoryChain: () =>
|
|
36
|
+
buildDirectoryChain: () => buildDirectoryChain2,
|
|
37
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
38
|
-
buildSearchRoots: () =>
|
|
38
|
+
buildSearchRoots: () => buildSearchRoots2,
|
|
39
39
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
40
40
|
createAgentKernel: () => createAgentKernel,
|
|
41
41
|
createProvider: () => createProvider,
|
|
42
42
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
43
43
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
44
|
-
fileExists: () =>
|
|
44
|
+
fileExists: () => fileExists2,
|
|
45
45
|
findGitRoot: () => findGitRoot,
|
|
46
46
|
getHitCount: () => getHitCount,
|
|
47
47
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
@@ -57,7 +57,7 @@ __export(index_exports, {
|
|
|
57
57
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
58
58
|
readTextFile: () => readTextFile,
|
|
59
59
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
60
|
-
resolveFileReference: () =>
|
|
60
|
+
resolveFileReference: () => resolveFileReference2,
|
|
61
61
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
62
62
|
runEvalCase: () => runEvalCase,
|
|
63
63
|
runEvaluation: () => runEvaluation,
|
|
@@ -116,47 +116,112 @@ function getHitCount(result) {
|
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
// src/evaluation/yaml-parser.ts
|
|
119
|
+
var import_promises5 = require("fs/promises");
|
|
120
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
121
|
+
var import_yaml2 = require("yaml");
|
|
122
|
+
|
|
123
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
124
|
+
function extractCodeBlocks(segments) {
|
|
125
|
+
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
126
|
+
const codeBlocks = [];
|
|
127
|
+
for (const segment of segments) {
|
|
128
|
+
const typeValue = segment["type"];
|
|
129
|
+
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
130
|
+
continue;
|
|
131
|
+
}
|
|
132
|
+
const textValue = segment["value"];
|
|
133
|
+
if (typeof textValue !== "string") {
|
|
134
|
+
continue;
|
|
135
|
+
}
|
|
136
|
+
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
137
|
+
if (matches) {
|
|
138
|
+
codeBlocks.push(...matches);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
return codeBlocks;
|
|
142
|
+
}
|
|
143
|
+
function formatFileContents(parts) {
|
|
144
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
145
|
+
if (fileCount > 0) {
|
|
146
|
+
return parts.map((part) => {
|
|
147
|
+
if (part.isFile && part.displayPath) {
|
|
148
|
+
return `<file path="${part.displayPath}">
|
|
149
|
+
${part.content}
|
|
150
|
+
</file>`;
|
|
151
|
+
}
|
|
152
|
+
return part.content;
|
|
153
|
+
}).join("\n\n");
|
|
154
|
+
}
|
|
155
|
+
return parts.map((p) => p.content).join(" ");
|
|
156
|
+
}
|
|
157
|
+
function formatSegment(segment) {
|
|
158
|
+
const type = asString(segment.type);
|
|
159
|
+
if (type === "text") {
|
|
160
|
+
return asString(segment.value);
|
|
161
|
+
}
|
|
162
|
+
if (type === "guideline_ref") {
|
|
163
|
+
const refPath = asString(segment.path);
|
|
164
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
165
|
+
}
|
|
166
|
+
if (type === "file") {
|
|
167
|
+
const text = asString(segment.text);
|
|
168
|
+
const filePath = asString(segment.path);
|
|
169
|
+
if (text && filePath) {
|
|
170
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
return void 0;
|
|
174
|
+
}
|
|
175
|
+
function hasVisibleContent(segments) {
|
|
176
|
+
return segments.some((segment) => {
|
|
177
|
+
const type = asString(segment.type);
|
|
178
|
+
if (type === "text") {
|
|
179
|
+
const value = asString(segment.value);
|
|
180
|
+
return value !== void 0 && value.trim().length > 0;
|
|
181
|
+
}
|
|
182
|
+
if (type === "guideline_ref") {
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
if (type === "file") {
|
|
186
|
+
const text = asString(segment.text);
|
|
187
|
+
return text !== void 0 && text.trim().length > 0;
|
|
188
|
+
}
|
|
189
|
+
return false;
|
|
190
|
+
});
|
|
191
|
+
}
|
|
192
|
+
function asString(value) {
|
|
193
|
+
return typeof value === "string" ? value : void 0;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// src/evaluation/loaders/config-loader.ts
|
|
119
197
|
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
120
|
-
var import_node_fs2 = require("fs");
|
|
121
198
|
var import_promises2 = require("fs/promises");
|
|
122
199
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
123
|
-
var import_node_url = require("url");
|
|
124
200
|
var import_yaml = require("yaml");
|
|
125
201
|
|
|
126
|
-
// src/evaluation/file-
|
|
202
|
+
// src/evaluation/loaders/file-resolver.ts
|
|
127
203
|
var import_node_fs = require("fs");
|
|
128
204
|
var import_promises = require("fs/promises");
|
|
129
205
|
var import_node_path = __toESM(require("path"), 1);
|
|
130
|
-
async function fileExists(
|
|
206
|
+
async function fileExists(absolutePath) {
|
|
131
207
|
try {
|
|
132
|
-
await (0, import_promises.access)(
|
|
208
|
+
await (0, import_promises.access)(absolutePath, import_node_fs.constants.F_OK);
|
|
133
209
|
return true;
|
|
134
210
|
} catch {
|
|
135
211
|
return false;
|
|
136
212
|
}
|
|
137
213
|
}
|
|
138
|
-
function
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
async function findGitRoot(startPath) {
|
|
146
|
-
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
147
|
-
const root = import_node_path.default.parse(currentDir).root;
|
|
148
|
-
while (currentDir !== root) {
|
|
149
|
-
const gitPath = import_node_path.default.join(currentDir, ".git");
|
|
150
|
-
if (await fileExists(gitPath)) {
|
|
151
|
-
return currentDir;
|
|
152
|
-
}
|
|
153
|
-
const parentDir = import_node_path.default.dirname(currentDir);
|
|
154
|
-
if (parentDir === currentDir) {
|
|
155
|
-
break;
|
|
214
|
+
function resolveToAbsolutePath(candidate) {
|
|
215
|
+
if (candidate instanceof URL) {
|
|
216
|
+
return new URL(candidate).pathname;
|
|
217
|
+
}
|
|
218
|
+
if (typeof candidate === "string") {
|
|
219
|
+
if (candidate.startsWith("file://")) {
|
|
220
|
+
return new URL(candidate).pathname;
|
|
156
221
|
}
|
|
157
|
-
|
|
222
|
+
return import_node_path.default.resolve(candidate);
|
|
158
223
|
}
|
|
159
|
-
|
|
224
|
+
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
160
225
|
}
|
|
161
226
|
function buildDirectoryChain(filePath, repoRoot) {
|
|
162
227
|
const directories = [];
|
|
@@ -234,44 +299,15 @@ async function resolveFileReference(rawValue, searchRoots) {
|
|
|
234
299
|
return { displayPath, attempted };
|
|
235
300
|
}
|
|
236
301
|
|
|
237
|
-
// src/evaluation/
|
|
238
|
-
var
|
|
302
|
+
// src/evaluation/loaders/config-loader.ts
|
|
303
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
239
304
|
var ANSI_YELLOW = "\x1B[33m";
|
|
240
305
|
var ANSI_RESET = "\x1B[0m";
|
|
241
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
242
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
243
|
-
async function readTestSuiteMetadata(testFilePath) {
|
|
244
|
-
try {
|
|
245
|
-
const absolutePath = import_node_path2.default.resolve(testFilePath);
|
|
246
|
-
const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
|
|
247
|
-
const parsed = (0, import_yaml.parse)(content);
|
|
248
|
-
if (!isJsonObject(parsed)) {
|
|
249
|
-
return {};
|
|
250
|
-
}
|
|
251
|
-
return { target: extractTargetFromSuite(parsed) };
|
|
252
|
-
} catch {
|
|
253
|
-
return {};
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
function extractTargetFromSuite(suite) {
|
|
257
|
-
const execution = suite.execution;
|
|
258
|
-
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
259
|
-
const executionTarget = execution.target;
|
|
260
|
-
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
261
|
-
return executionTarget.trim();
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
const targetValue = suite.target;
|
|
265
|
-
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
266
|
-
return targetValue.trim();
|
|
267
|
-
}
|
|
268
|
-
return void 0;
|
|
269
|
-
}
|
|
270
306
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
271
307
|
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
272
308
|
for (const directory of directories) {
|
|
273
309
|
const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
|
|
274
|
-
if (!await
|
|
310
|
+
if (!await fileExists(configPath)) {
|
|
275
311
|
continue;
|
|
276
312
|
}
|
|
277
313
|
try {
|
|
@@ -313,24 +349,134 @@ function isGuidelineFile(filePath, patterns) {
|
|
|
313
349
|
const patternsToUse = patterns ?? [];
|
|
314
350
|
return import_micromatch.default.isMatch(normalized, patternsToUse);
|
|
315
351
|
}
|
|
316
|
-
function
|
|
317
|
-
const
|
|
318
|
-
|
|
319
|
-
const
|
|
320
|
-
if (typeof
|
|
352
|
+
function extractTargetFromSuite(suite) {
|
|
353
|
+
const execution = suite.execution;
|
|
354
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
355
|
+
const executionTarget = execution.target;
|
|
356
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
357
|
+
return executionTarget.trim();
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
const targetValue = suite.target;
|
|
361
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
362
|
+
return targetValue.trim();
|
|
363
|
+
}
|
|
364
|
+
return void 0;
|
|
365
|
+
}
|
|
366
|
+
function logWarning(message) {
|
|
367
|
+
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
371
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
372
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
373
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
374
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
375
|
+
const execution = rawEvalCase.execution;
|
|
376
|
+
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
377
|
+
if (candidateEvaluators === void 0) {
|
|
378
|
+
return void 0;
|
|
379
|
+
}
|
|
380
|
+
if (!Array.isArray(candidateEvaluators)) {
|
|
381
|
+
logWarning2(`Skipping evaluators for '${evalId}': expected array`);
|
|
382
|
+
return void 0;
|
|
383
|
+
}
|
|
384
|
+
const evaluators = [];
|
|
385
|
+
for (const rawEvaluator of candidateEvaluators) {
|
|
386
|
+
if (!isJsonObject2(rawEvaluator)) {
|
|
387
|
+
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
321
388
|
continue;
|
|
322
389
|
}
|
|
323
|
-
const
|
|
324
|
-
|
|
390
|
+
const name = asString2(rawEvaluator.name);
|
|
391
|
+
const typeValue = rawEvaluator.type;
|
|
392
|
+
if (!name || !isEvaluatorKind(typeValue)) {
|
|
393
|
+
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
325
394
|
continue;
|
|
326
395
|
}
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
396
|
+
if (typeValue === "code") {
|
|
397
|
+
const script = asString2(rawEvaluator.script);
|
|
398
|
+
if (!script) {
|
|
399
|
+
logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
400
|
+
continue;
|
|
401
|
+
}
|
|
402
|
+
const cwd = asString2(rawEvaluator.cwd);
|
|
403
|
+
let resolvedCwd;
|
|
404
|
+
if (cwd) {
|
|
405
|
+
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
406
|
+
if (resolved.resolvedPath) {
|
|
407
|
+
resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
408
|
+
} else {
|
|
409
|
+
logWarning2(
|
|
410
|
+
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
411
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
412
|
+
);
|
|
413
|
+
}
|
|
414
|
+
} else {
|
|
415
|
+
resolvedCwd = searchRoots[0];
|
|
416
|
+
}
|
|
417
|
+
evaluators.push({
|
|
418
|
+
name,
|
|
419
|
+
type: "code",
|
|
420
|
+
script,
|
|
421
|
+
cwd,
|
|
422
|
+
resolvedCwd
|
|
423
|
+
});
|
|
424
|
+
continue;
|
|
425
|
+
}
|
|
426
|
+
const prompt = asString2(rawEvaluator.prompt);
|
|
427
|
+
let promptPath;
|
|
428
|
+
if (prompt) {
|
|
429
|
+
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
430
|
+
if (resolved.resolvedPath) {
|
|
431
|
+
promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
432
|
+
} else {
|
|
433
|
+
logWarning2(
|
|
434
|
+
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
435
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
436
|
+
);
|
|
437
|
+
}
|
|
330
438
|
}
|
|
439
|
+
const _model = asString2(rawEvaluator.model);
|
|
440
|
+
evaluators.push({
|
|
441
|
+
name,
|
|
442
|
+
type: "llm_judge",
|
|
443
|
+
prompt,
|
|
444
|
+
promptPath
|
|
445
|
+
});
|
|
331
446
|
}
|
|
332
|
-
return
|
|
447
|
+
return evaluators.length > 0 ? evaluators : void 0;
|
|
448
|
+
}
|
|
449
|
+
function coerceEvaluator(candidate, contextId) {
|
|
450
|
+
if (typeof candidate !== "string") {
|
|
451
|
+
return void 0;
|
|
452
|
+
}
|
|
453
|
+
if (isEvaluatorKind(candidate)) {
|
|
454
|
+
return candidate;
|
|
455
|
+
}
|
|
456
|
+
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
457
|
+
return void 0;
|
|
458
|
+
}
|
|
459
|
+
function asString2(value) {
|
|
460
|
+
return typeof value === "string" ? value : void 0;
|
|
461
|
+
}
|
|
462
|
+
function isJsonObject2(value) {
|
|
463
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
333
464
|
}
|
|
465
|
+
function logWarning2(message, details) {
|
|
466
|
+
if (details && details.length > 0) {
|
|
467
|
+
const detailBlock = details.join("\n");
|
|
468
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}
|
|
469
|
+
${detailBlock}${ANSI_RESET2}`);
|
|
470
|
+
} else {
|
|
471
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
472
|
+
}
|
|
473
|
+
}
|
|
474
|
+
|
|
475
|
+
// src/evaluation/loaders/message-processor.ts
|
|
476
|
+
var import_promises3 = require("fs/promises");
|
|
477
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
478
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
479
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
334
480
|
async function processMessages(options) {
|
|
335
481
|
const {
|
|
336
482
|
messages,
|
|
@@ -356,9 +502,9 @@ async function processMessages(options) {
|
|
|
356
502
|
if (!isJsonObject(rawSegment)) {
|
|
357
503
|
continue;
|
|
358
504
|
}
|
|
359
|
-
const segmentType =
|
|
505
|
+
const segmentType = asString3(rawSegment.type);
|
|
360
506
|
if (segmentType === "file") {
|
|
361
|
-
const rawValue =
|
|
507
|
+
const rawValue = asString3(rawSegment.value);
|
|
362
508
|
if (!rawValue) {
|
|
363
509
|
continue;
|
|
364
510
|
}
|
|
@@ -369,15 +515,15 @@ async function processMessages(options) {
|
|
|
369
515
|
if (!resolvedPath) {
|
|
370
516
|
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
371
517
|
const context = messageType === "input" ? "" : " in expected_messages";
|
|
372
|
-
|
|
518
|
+
logWarning3(`File not found${context}: ${displayPath}`, attempts);
|
|
373
519
|
continue;
|
|
374
520
|
}
|
|
375
521
|
try {
|
|
376
|
-
const fileContent = (await (0,
|
|
522
|
+
const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
377
523
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
378
|
-
const relativeToRepo =
|
|
524
|
+
const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
|
|
379
525
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
380
|
-
guidelinePaths.push(
|
|
526
|
+
guidelinePaths.push(import_node_path4.default.resolve(resolvedPath));
|
|
381
527
|
if (verbose) {
|
|
382
528
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
383
529
|
console.log(` Resolved to: ${resolvedPath}`);
|
|
@@ -389,7 +535,7 @@ async function processMessages(options) {
|
|
|
389
535
|
type: "file",
|
|
390
536
|
path: displayPath,
|
|
391
537
|
text: fileContent,
|
|
392
|
-
resolvedPath:
|
|
538
|
+
resolvedPath: import_node_path4.default.resolve(resolvedPath)
|
|
393
539
|
});
|
|
394
540
|
if (verbose) {
|
|
395
541
|
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
@@ -398,7 +544,7 @@ async function processMessages(options) {
|
|
|
398
544
|
}
|
|
399
545
|
} catch (error) {
|
|
400
546
|
const context = messageType === "input" ? "" : " expected output";
|
|
401
|
-
|
|
547
|
+
logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
402
548
|
}
|
|
403
549
|
continue;
|
|
404
550
|
}
|
|
@@ -412,202 +558,120 @@ async function processMessages(options) {
|
|
|
412
558
|
}
|
|
413
559
|
return segments;
|
|
414
560
|
}
|
|
415
|
-
async function
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
419
|
-
if (!await fileExists2(absoluteTestPath)) {
|
|
420
|
-
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
421
|
-
}
|
|
422
|
-
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
423
|
-
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
424
|
-
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
425
|
-
const guidelinePatterns = config?.guideline_patterns;
|
|
426
|
-
const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
|
|
427
|
-
const parsed = (0, import_yaml.parse)(rawFile);
|
|
428
|
-
if (!isJsonObject(parsed)) {
|
|
429
|
-
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
430
|
-
}
|
|
431
|
-
const suite = parsed;
|
|
432
|
-
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
433
|
-
const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
434
|
-
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
435
|
-
const schema = suite.$schema;
|
|
436
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
437
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
438
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
439
|
-
throw new Error(message);
|
|
561
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
562
|
+
if (typeof content === "string") {
|
|
563
|
+
return content;
|
|
440
564
|
}
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
565
|
+
if (!content) {
|
|
566
|
+
return "";
|
|
444
567
|
}
|
|
445
|
-
const
|
|
446
|
-
const
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
for (const rawEvalcase of rawTestcases) {
|
|
450
|
-
if (!isJsonObject(rawEvalcase)) {
|
|
451
|
-
logWarning("Skipping invalid eval case entry (expected object)");
|
|
568
|
+
const parts = [];
|
|
569
|
+
for (const entry of content) {
|
|
570
|
+
if (typeof entry === "string") {
|
|
571
|
+
parts.push({ content: entry, isFile: false });
|
|
452
572
|
continue;
|
|
453
573
|
}
|
|
454
|
-
|
|
455
|
-
const id = asString(evalcase.id);
|
|
456
|
-
if (evalIdFilter && id !== evalIdFilter) {
|
|
574
|
+
if (!isJsonObject(entry)) {
|
|
457
575
|
continue;
|
|
458
576
|
}
|
|
459
|
-
const
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
465
|
-
continue;
|
|
466
|
-
}
|
|
467
|
-
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
468
|
-
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
469
|
-
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
470
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
471
|
-
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
472
|
-
continue;
|
|
473
|
-
}
|
|
474
|
-
if (expectedMessages.length > 1) {
|
|
475
|
-
logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
476
|
-
}
|
|
477
|
-
const guidelinePaths = [];
|
|
478
|
-
const inputTextParts = [];
|
|
479
|
-
const inputSegments = await processMessages({
|
|
480
|
-
messages: inputMessages,
|
|
481
|
-
searchRoots,
|
|
482
|
-
repoRootPath,
|
|
483
|
-
guidelinePatterns,
|
|
484
|
-
guidelinePaths,
|
|
485
|
-
textParts: inputTextParts,
|
|
486
|
-
messageType: "input",
|
|
487
|
-
verbose
|
|
488
|
-
});
|
|
489
|
-
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
490
|
-
messages: expectedMessages,
|
|
491
|
-
searchRoots,
|
|
492
|
-
repoRootPath,
|
|
493
|
-
guidelinePatterns,
|
|
494
|
-
messageType: "output",
|
|
495
|
-
verbose
|
|
496
|
-
}) : [];
|
|
497
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
498
|
-
const expectedContent = expectedMessages[0]?.content;
|
|
499
|
-
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
500
|
-
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
501
|
-
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
502
|
-
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
503
|
-
const userFilePaths = [];
|
|
504
|
-
for (const segment of inputSegments) {
|
|
505
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
506
|
-
userFilePaths.push(segment.resolvedPath);
|
|
577
|
+
const segmentType = asString3(entry.type);
|
|
578
|
+
if (segmentType === "file") {
|
|
579
|
+
const rawValue = asString3(entry.value);
|
|
580
|
+
if (!rawValue) {
|
|
581
|
+
continue;
|
|
507
582
|
}
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
guideline_patterns: guidelinePatterns,
|
|
524
|
-
file_paths: allFilePaths,
|
|
525
|
-
code_snippets: codeSnippets,
|
|
526
|
-
expected_outcome: outcome,
|
|
527
|
-
evaluator: evalCaseEvaluatorKind,
|
|
528
|
-
evaluators
|
|
529
|
-
};
|
|
530
|
-
if (verbose) {
|
|
531
|
-
console.log(`
|
|
532
|
-
[Eval Case: ${id}]`);
|
|
533
|
-
if (testCase.guideline_paths.length > 0) {
|
|
534
|
-
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
535
|
-
for (const guidelinePath of testCase.guideline_paths) {
|
|
536
|
-
console.log(` - ${guidelinePath}`);
|
|
583
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
584
|
+
rawValue,
|
|
585
|
+
searchRoots
|
|
586
|
+
);
|
|
587
|
+
if (!resolvedPath) {
|
|
588
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
589
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
590
|
+
continue;
|
|
591
|
+
}
|
|
592
|
+
try {
|
|
593
|
+
const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
594
|
+
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
595
|
+
if (verbose) {
|
|
596
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
597
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
537
598
|
}
|
|
538
|
-
}
|
|
539
|
-
|
|
599
|
+
} catch (error) {
|
|
600
|
+
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
540
601
|
}
|
|
602
|
+
continue;
|
|
541
603
|
}
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
for (const segments of processedSegmentsByMessage) {
|
|
552
|
-
if (hasVisibleContent(segments)) {
|
|
553
|
-
messagesWithContent++;
|
|
604
|
+
const textValue = asString3(entry.text);
|
|
605
|
+
if (typeof textValue === "string") {
|
|
606
|
+
parts.push({ content: textValue, isFile: false });
|
|
607
|
+
continue;
|
|
608
|
+
}
|
|
609
|
+
const valueValue = asString3(entry.value);
|
|
610
|
+
if (typeof valueValue === "string") {
|
|
611
|
+
parts.push({ content: valueValue, isFile: false });
|
|
612
|
+
continue;
|
|
554
613
|
}
|
|
614
|
+
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
555
615
|
}
|
|
556
|
-
return
|
|
616
|
+
return formatFileContents(parts);
|
|
557
617
|
}
|
|
558
|
-
function
|
|
559
|
-
return
|
|
560
|
-
const type = asString(segment.type);
|
|
561
|
-
if (type === "text") {
|
|
562
|
-
const value = asString(segment.value);
|
|
563
|
-
return value !== void 0 && value.trim().length > 0;
|
|
564
|
-
}
|
|
565
|
-
if (type === "guideline_ref") {
|
|
566
|
-
return false;
|
|
567
|
-
}
|
|
568
|
-
if (type === "file") {
|
|
569
|
-
const text = asString(segment.text);
|
|
570
|
-
return text !== void 0 && text.trim().length > 0;
|
|
571
|
-
}
|
|
572
|
-
return false;
|
|
573
|
-
});
|
|
618
|
+
function asString3(value) {
|
|
619
|
+
return typeof value === "string" ? value : void 0;
|
|
574
620
|
}
|
|
575
|
-
function
|
|
576
|
-
const
|
|
577
|
-
|
|
578
|
-
|
|
621
|
+
function cloneJsonObject(source) {
|
|
622
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
623
|
+
return Object.fromEntries(entries);
|
|
624
|
+
}
|
|
625
|
+
function cloneJsonValue(value) {
|
|
626
|
+
if (value === null) {
|
|
627
|
+
return null;
|
|
579
628
|
}
|
|
580
|
-
if (
|
|
581
|
-
|
|
582
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
629
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
630
|
+
return value;
|
|
583
631
|
}
|
|
584
|
-
if (
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
632
|
+
if (Array.isArray(value)) {
|
|
633
|
+
return value.map((item) => cloneJsonValue(item));
|
|
634
|
+
}
|
|
635
|
+
if (typeof value === "object") {
|
|
636
|
+
return cloneJsonObject(value);
|
|
637
|
+
}
|
|
638
|
+
return value;
|
|
639
|
+
}
|
|
640
|
+
function logWarning3(message, details) {
|
|
641
|
+
if (details && details.length > 0) {
|
|
642
|
+
const detailBlock = details.join("\n");
|
|
643
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
644
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
645
|
+
} else {
|
|
646
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
591
647
|
}
|
|
592
|
-
return void 0;
|
|
593
648
|
}
|
|
649
|
+
|
|
650
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
651
|
+
var import_promises4 = require("fs/promises");
|
|
652
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
653
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
654
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
594
655
|
async function buildPromptInputs(testCase) {
|
|
595
|
-
const
|
|
656
|
+
const guidelineParts = [];
|
|
596
657
|
for (const rawPath of testCase.guideline_paths) {
|
|
597
|
-
const absolutePath =
|
|
598
|
-
if (!await
|
|
599
|
-
|
|
658
|
+
const absolutePath = import_node_path5.default.resolve(rawPath);
|
|
659
|
+
if (!await fileExists(absolutePath)) {
|
|
660
|
+
logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
600
661
|
continue;
|
|
601
662
|
}
|
|
602
663
|
try {
|
|
603
|
-
const content = (await (0,
|
|
604
|
-
|
|
605
|
-
|
|
664
|
+
const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
665
|
+
guidelineParts.push({
|
|
666
|
+
content,
|
|
667
|
+
isFile: true,
|
|
668
|
+
displayPath: import_node_path5.default.basename(absolutePath)
|
|
669
|
+
});
|
|
606
670
|
} catch (error) {
|
|
607
|
-
|
|
671
|
+
logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
608
672
|
}
|
|
609
673
|
}
|
|
610
|
-
const guidelines =
|
|
674
|
+
const guidelines = formatFileContents(guidelineParts);
|
|
611
675
|
const segmentsByMessage = [];
|
|
612
676
|
const fileContentsByPath = /* @__PURE__ */ new Map();
|
|
613
677
|
for (const segment of testCase.input_segments) {
|
|
@@ -628,9 +692,9 @@ ${content}`);
|
|
|
628
692
|
messageSegments.push({ type: "text", value: segment });
|
|
629
693
|
}
|
|
630
694
|
} else if (isJsonObject(segment)) {
|
|
631
|
-
const type =
|
|
695
|
+
const type = asString4(segment.type);
|
|
632
696
|
if (type === "file") {
|
|
633
|
-
const value =
|
|
697
|
+
const value = asString4(segment.value);
|
|
634
698
|
if (!value) continue;
|
|
635
699
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
636
700
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -641,7 +705,7 @@ ${content}`);
|
|
|
641
705
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
642
706
|
}
|
|
643
707
|
} else if (type === "text") {
|
|
644
|
-
const textValue =
|
|
708
|
+
const textValue = asString4(segment.value);
|
|
645
709
|
if (textValue && textValue.trim().length > 0) {
|
|
646
710
|
messageSegments.push({ type: "text", value: textValue });
|
|
647
711
|
}
|
|
@@ -697,6 +761,18 @@ ${messageContent}`);
|
|
|
697
761
|
}) : void 0;
|
|
698
762
|
return { question, guidelines, chatPrompt };
|
|
699
763
|
}
|
|
764
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
765
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
766
|
+
return true;
|
|
767
|
+
}
|
|
768
|
+
let messagesWithContent = 0;
|
|
769
|
+
for (const segments of processedSegmentsByMessage) {
|
|
770
|
+
if (hasVisibleContent(segments)) {
|
|
771
|
+
messagesWithContent++;
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
return messagesWithContent > 1;
|
|
775
|
+
}
|
|
700
776
|
function buildChatPromptFromSegments(options) {
|
|
701
777
|
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
702
778
|
if (messages.length === 0) {
|
|
@@ -756,209 +832,294 @@ ${guidelineContent.trim()}`);
|
|
|
756
832
|
if (isGuidelineRef) {
|
|
757
833
|
continue;
|
|
758
834
|
}
|
|
759
|
-
contentParts.push(formatted);
|
|
835
|
+
contentParts.push(formatted);
|
|
836
|
+
}
|
|
837
|
+
}
|
|
838
|
+
if (contentParts.length === 0) {
|
|
839
|
+
continue;
|
|
840
|
+
}
|
|
841
|
+
chatPrompt.push({
|
|
842
|
+
role,
|
|
843
|
+
content: contentParts.join("\n"),
|
|
844
|
+
...name ? { name } : {}
|
|
845
|
+
});
|
|
846
|
+
}
|
|
847
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
848
|
+
}
|
|
849
|
+
function asString4(value) {
|
|
850
|
+
return typeof value === "string" ? value : void 0;
|
|
851
|
+
}
|
|
852
|
+
function logWarning4(message) {
|
|
853
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
// src/evaluation/yaml-parser.ts
|
|
857
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
858
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
859
|
+
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
860
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
861
|
+
try {
|
|
862
|
+
const absolutePath = import_node_path6.default.resolve(testFilePath);
|
|
863
|
+
const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
|
|
864
|
+
const parsed = (0, import_yaml2.parse)(content);
|
|
865
|
+
if (!isJsonObject(parsed)) {
|
|
866
|
+
return {};
|
|
867
|
+
}
|
|
868
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
869
|
+
} catch {
|
|
870
|
+
return {};
|
|
871
|
+
}
|
|
872
|
+
}
|
|
873
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
874
|
+
const verbose = options?.verbose ?? false;
|
|
875
|
+
const evalIdFilter = options?.evalId;
|
|
876
|
+
const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
|
|
877
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
878
|
+
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
879
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
880
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
881
|
+
const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
|
|
882
|
+
const parsed = (0, import_yaml2.parse)(rawFile);
|
|
883
|
+
if (!isJsonObject(parsed)) {
|
|
884
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
885
|
+
}
|
|
886
|
+
const suite = parsed;
|
|
887
|
+
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
888
|
+
const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
889
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
890
|
+
const schema = suite.$schema;
|
|
891
|
+
if (schema !== SCHEMA_EVAL_V2) {
|
|
892
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
893
|
+
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
894
|
+
throw new Error(message);
|
|
895
|
+
}
|
|
896
|
+
const rawTestcases = suite.evalcases;
|
|
897
|
+
if (!Array.isArray(rawTestcases)) {
|
|
898
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
899
|
+
}
|
|
900
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
901
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
902
|
+
const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
|
|
903
|
+
const results = [];
|
|
904
|
+
for (const rawEvalcase of rawTestcases) {
|
|
905
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
906
|
+
logWarning5("Skipping invalid eval case entry (expected object)");
|
|
907
|
+
continue;
|
|
908
|
+
}
|
|
909
|
+
const evalcase = rawEvalcase;
|
|
910
|
+
const id = asString5(evalcase.id);
|
|
911
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
912
|
+
continue;
|
|
913
|
+
}
|
|
914
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
915
|
+
const outcome = asString5(evalcase.outcome);
|
|
916
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
917
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
918
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
919
|
+
logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
920
|
+
continue;
|
|
921
|
+
}
|
|
922
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
923
|
+
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
924
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
925
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
926
|
+
logWarning5(`No valid expected message found for eval case: ${id}`);
|
|
927
|
+
continue;
|
|
928
|
+
}
|
|
929
|
+
if (expectedMessages.length > 1) {
|
|
930
|
+
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
931
|
+
}
|
|
932
|
+
const guidelinePaths = [];
|
|
933
|
+
const inputTextParts = [];
|
|
934
|
+
const inputSegments = await processMessages({
|
|
935
|
+
messages: inputMessages,
|
|
936
|
+
searchRoots,
|
|
937
|
+
repoRootPath,
|
|
938
|
+
guidelinePatterns,
|
|
939
|
+
guidelinePaths,
|
|
940
|
+
textParts: inputTextParts,
|
|
941
|
+
messageType: "input",
|
|
942
|
+
verbose
|
|
943
|
+
});
|
|
944
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
945
|
+
messages: expectedMessages,
|
|
946
|
+
searchRoots,
|
|
947
|
+
repoRootPath,
|
|
948
|
+
guidelinePatterns,
|
|
949
|
+
messageType: "output",
|
|
950
|
+
verbose
|
|
951
|
+
}) : [];
|
|
952
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
953
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
954
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
955
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
956
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
957
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
958
|
+
const userFilePaths = [];
|
|
959
|
+
for (const segment of inputSegments) {
|
|
960
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
961
|
+
userFilePaths.push(segment.resolvedPath);
|
|
962
|
+
}
|
|
963
|
+
}
|
|
964
|
+
const allFilePaths = [
|
|
965
|
+
...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
966
|
+
...userFilePaths
|
|
967
|
+
];
|
|
968
|
+
const testCase = {
|
|
969
|
+
id,
|
|
970
|
+
dataset: datasetName,
|
|
971
|
+
conversation_id: conversationId,
|
|
972
|
+
question,
|
|
973
|
+
input_messages: inputMessages,
|
|
974
|
+
input_segments: inputSegments,
|
|
975
|
+
output_segments: outputSegments,
|
|
976
|
+
reference_answer: referenceAnswer,
|
|
977
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
978
|
+
guideline_patterns: guidelinePatterns,
|
|
979
|
+
file_paths: allFilePaths,
|
|
980
|
+
code_snippets: codeSnippets,
|
|
981
|
+
expected_outcome: outcome,
|
|
982
|
+
evaluator: evalCaseEvaluatorKind,
|
|
983
|
+
evaluators
|
|
984
|
+
};
|
|
985
|
+
if (verbose) {
|
|
986
|
+
console.log(`
|
|
987
|
+
[Eval Case: ${id}]`);
|
|
988
|
+
if (testCase.guideline_paths.length > 0) {
|
|
989
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
990
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
991
|
+
console.log(` - ${guidelinePath}`);
|
|
992
|
+
}
|
|
993
|
+
} else {
|
|
994
|
+
console.log(" No guidelines found");
|
|
760
995
|
}
|
|
761
996
|
}
|
|
762
|
-
|
|
763
|
-
continue;
|
|
764
|
-
}
|
|
765
|
-
chatPrompt.push({
|
|
766
|
-
role,
|
|
767
|
-
content: contentParts.join("\n"),
|
|
768
|
-
...name ? { name } : {}
|
|
769
|
-
});
|
|
997
|
+
results.push(testCase);
|
|
770
998
|
}
|
|
771
|
-
return
|
|
999
|
+
return results;
|
|
1000
|
+
}
|
|
1001
|
+
function asString5(value) {
|
|
1002
|
+
return typeof value === "string" ? value : void 0;
|
|
772
1003
|
}
|
|
773
|
-
|
|
1004
|
+
function logWarning5(message, details) {
|
|
1005
|
+
if (details && details.length > 0) {
|
|
1006
|
+
const detailBlock = details.join("\n");
|
|
1007
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
1008
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1009
|
+
} else {
|
|
1010
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
1013
|
+
|
|
1014
|
+
// src/evaluation/file-utils.ts
|
|
1015
|
+
var import_node_fs2 = require("fs");
|
|
1016
|
+
var import_promises6 = require("fs/promises");
|
|
1017
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
1018
|
+
async function fileExists2(filePath) {
|
|
774
1019
|
try {
|
|
775
|
-
await (0,
|
|
1020
|
+
await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
|
|
776
1021
|
return true;
|
|
777
1022
|
} catch {
|
|
778
1023
|
return false;
|
|
779
1024
|
}
|
|
780
1025
|
}
|
|
781
|
-
function
|
|
782
|
-
|
|
783
|
-
return (0, import_node_url.fileURLToPath)(candidate);
|
|
784
|
-
}
|
|
785
|
-
if (typeof candidate === "string") {
|
|
786
|
-
if (candidate.startsWith("file://")) {
|
|
787
|
-
return (0, import_node_url.fileURLToPath)(new URL(candidate));
|
|
788
|
-
}
|
|
789
|
-
return import_node_path2.default.resolve(candidate);
|
|
790
|
-
}
|
|
791
|
-
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
792
|
-
}
|
|
793
|
-
function asString(value) {
|
|
794
|
-
return typeof value === "string" ? value : void 0;
|
|
1026
|
+
function normalizeLineEndings(content) {
|
|
1027
|
+
return content.replace(/\r\n/g, "\n");
|
|
795
1028
|
}
|
|
796
|
-
function
|
|
797
|
-
const
|
|
798
|
-
return
|
|
1029
|
+
async function readTextFile(filePath) {
|
|
1030
|
+
const content = await (0, import_promises6.readFile)(filePath, "utf8");
|
|
1031
|
+
return normalizeLineEndings(content);
|
|
799
1032
|
}
|
|
800
|
-
function
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
1033
|
+
async function findGitRoot(startPath) {
|
|
1034
|
+
let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
|
|
1035
|
+
const root = import_node_path7.default.parse(currentDir).root;
|
|
1036
|
+
while (currentDir !== root) {
|
|
1037
|
+
const gitPath = import_node_path7.default.join(currentDir, ".git");
|
|
1038
|
+
if (await fileExists2(gitPath)) {
|
|
1039
|
+
return currentDir;
|
|
1040
|
+
}
|
|
1041
|
+
const parentDir = import_node_path7.default.dirname(currentDir);
|
|
1042
|
+
if (parentDir === currentDir) {
|
|
1043
|
+
break;
|
|
1044
|
+
}
|
|
1045
|
+
currentDir = parentDir;
|
|
809
1046
|
}
|
|
810
|
-
return
|
|
1047
|
+
return null;
|
|
811
1048
|
}
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
if (typeof entry === "string") {
|
|
822
|
-
parts.push(entry);
|
|
823
|
-
continue;
|
|
1049
|
+
function buildDirectoryChain2(filePath, repoRoot) {
|
|
1050
|
+
const directories = [];
|
|
1051
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1052
|
+
const boundary = import_node_path7.default.resolve(repoRoot);
|
|
1053
|
+
let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
|
|
1054
|
+
while (current !== void 0) {
|
|
1055
|
+
if (!seen.has(current)) {
|
|
1056
|
+
directories.push(current);
|
|
1057
|
+
seen.add(current);
|
|
824
1058
|
}
|
|
825
|
-
if (
|
|
826
|
-
|
|
1059
|
+
if (current === boundary) {
|
|
1060
|
+
break;
|
|
827
1061
|
}
|
|
828
|
-
const
|
|
829
|
-
if (
|
|
830
|
-
|
|
831
|
-
if (!rawValue) {
|
|
832
|
-
continue;
|
|
833
|
-
}
|
|
834
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
835
|
-
rawValue,
|
|
836
|
-
searchRoots
|
|
837
|
-
);
|
|
838
|
-
if (!resolvedPath) {
|
|
839
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
840
|
-
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
841
|
-
continue;
|
|
842
|
-
}
|
|
843
|
-
try {
|
|
844
|
-
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
845
|
-
parts.push(fileContent);
|
|
846
|
-
if (verbose) {
|
|
847
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
848
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
849
|
-
}
|
|
850
|
-
} catch (error) {
|
|
851
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
852
|
-
}
|
|
853
|
-
continue;
|
|
1062
|
+
const parent = import_node_path7.default.dirname(current);
|
|
1063
|
+
if (parent === current) {
|
|
1064
|
+
break;
|
|
854
1065
|
}
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
1066
|
+
current = parent;
|
|
1067
|
+
}
|
|
1068
|
+
if (!seen.has(boundary)) {
|
|
1069
|
+
directories.push(boundary);
|
|
1070
|
+
}
|
|
1071
|
+
return directories;
|
|
1072
|
+
}
|
|
1073
|
+
function buildSearchRoots2(evalPath, repoRoot) {
|
|
1074
|
+
const uniqueRoots = [];
|
|
1075
|
+
const addRoot = (root) => {
|
|
1076
|
+
const normalized = import_node_path7.default.resolve(root);
|
|
1077
|
+
if (!uniqueRoots.includes(normalized)) {
|
|
1078
|
+
uniqueRoots.push(normalized);
|
|
859
1079
|
}
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
1080
|
+
};
|
|
1081
|
+
let currentDir = import_node_path7.default.dirname(evalPath);
|
|
1082
|
+
let reachedBoundary = false;
|
|
1083
|
+
while (!reachedBoundary) {
|
|
1084
|
+
addRoot(currentDir);
|
|
1085
|
+
const parentDir = import_node_path7.default.dirname(currentDir);
|
|
1086
|
+
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
1087
|
+
reachedBoundary = true;
|
|
1088
|
+
} else {
|
|
1089
|
+
currentDir = parentDir;
|
|
864
1090
|
}
|
|
865
|
-
parts.push(JSON.stringify(entry));
|
|
866
1091
|
}
|
|
867
|
-
|
|
1092
|
+
addRoot(repoRoot);
|
|
1093
|
+
addRoot(process.cwd());
|
|
1094
|
+
return uniqueRoots;
|
|
868
1095
|
}
|
|
869
|
-
|
|
870
|
-
const
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
1096
|
+
function trimLeadingSeparators2(value) {
|
|
1097
|
+
const trimmed = value.replace(/^[/\\]+/, "");
|
|
1098
|
+
return trimmed.length > 0 ? trimmed : value;
|
|
1099
|
+
}
|
|
1100
|
+
async function resolveFileReference2(rawValue, searchRoots) {
|
|
1101
|
+
const displayPath = trimLeadingSeparators2(rawValue);
|
|
1102
|
+
const potentialPaths = [];
|
|
1103
|
+
if (import_node_path7.default.isAbsolute(rawValue)) {
|
|
1104
|
+
potentialPaths.push(import_node_path7.default.normalize(rawValue));
|
|
874
1105
|
}
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
return void 0;
|
|
1106
|
+
for (const base of searchRoots) {
|
|
1107
|
+
potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
|
|
878
1108
|
}
|
|
879
|
-
const
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
}
|
|
885
|
-
const name = asString(rawEvaluator.name);
|
|
886
|
-
const typeValue = rawEvaluator.type;
|
|
887
|
-
if (!name || !isEvaluatorKind(typeValue)) {
|
|
888
|
-
logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
889
|
-
continue;
|
|
890
|
-
}
|
|
891
|
-
if (typeValue === "code") {
|
|
892
|
-
const script = asString(rawEvaluator.script);
|
|
893
|
-
if (!script) {
|
|
894
|
-
logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
895
|
-
continue;
|
|
896
|
-
}
|
|
897
|
-
const cwd = asString(rawEvaluator.cwd);
|
|
898
|
-
let resolvedCwd;
|
|
899
|
-
if (cwd) {
|
|
900
|
-
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
901
|
-
if (resolved.resolvedPath) {
|
|
902
|
-
resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
|
|
903
|
-
} else {
|
|
904
|
-
logWarning(
|
|
905
|
-
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
906
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
907
|
-
);
|
|
908
|
-
}
|
|
909
|
-
} else {
|
|
910
|
-
resolvedCwd = searchRoots[0];
|
|
911
|
-
}
|
|
912
|
-
evaluators.push({
|
|
913
|
-
name,
|
|
914
|
-
type: "code",
|
|
915
|
-
script,
|
|
916
|
-
cwd,
|
|
917
|
-
resolvedCwd
|
|
918
|
-
});
|
|
1109
|
+
const attempted = [];
|
|
1110
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1111
|
+
for (const candidate of potentialPaths) {
|
|
1112
|
+
const absoluteCandidate = import_node_path7.default.resolve(candidate);
|
|
1113
|
+
if (seen.has(absoluteCandidate)) {
|
|
919
1114
|
continue;
|
|
920
1115
|
}
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
if (
|
|
924
|
-
|
|
925
|
-
if (resolved.resolvedPath) {
|
|
926
|
-
promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
|
|
927
|
-
} else {
|
|
928
|
-
logWarning(
|
|
929
|
-
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
930
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
931
|
-
);
|
|
932
|
-
}
|
|
1116
|
+
seen.add(absoluteCandidate);
|
|
1117
|
+
attempted.push(absoluteCandidate);
|
|
1118
|
+
if (await fileExists2(absoluteCandidate)) {
|
|
1119
|
+
return { displayPath, resolvedPath: absoluteCandidate, attempted };
|
|
933
1120
|
}
|
|
934
|
-
const model = asString(rawEvaluator.model);
|
|
935
|
-
evaluators.push({
|
|
936
|
-
name,
|
|
937
|
-
type: "llm_judge",
|
|
938
|
-
prompt,
|
|
939
|
-
promptPath
|
|
940
|
-
});
|
|
941
|
-
}
|
|
942
|
-
return evaluators.length > 0 ? evaluators : void 0;
|
|
943
|
-
}
|
|
944
|
-
function coerceEvaluator(candidate, contextId) {
|
|
945
|
-
if (typeof candidate !== "string") {
|
|
946
|
-
return void 0;
|
|
947
|
-
}
|
|
948
|
-
if (isEvaluatorKind(candidate)) {
|
|
949
|
-
return candidate;
|
|
950
|
-
}
|
|
951
|
-
logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
952
|
-
return void 0;
|
|
953
|
-
}
|
|
954
|
-
function logWarning(message, details) {
|
|
955
|
-
if (details && details.length > 0) {
|
|
956
|
-
const detailBlock = details.join("\n");
|
|
957
|
-
console.warn(`${ANSI_YELLOW}Warning: ${message}
|
|
958
|
-
${detailBlock}${ANSI_RESET}`);
|
|
959
|
-
} else {
|
|
960
|
-
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
961
1121
|
}
|
|
1122
|
+
return { displayPath, attempted };
|
|
962
1123
|
}
|
|
963
1124
|
|
|
964
1125
|
// src/evaluation/providers/ax.ts
|
|
@@ -989,9 +1150,8 @@ function buildChatPrompt(request) {
|
|
|
989
1150
|
}
|
|
990
1151
|
function resolveSystemContent(request) {
|
|
991
1152
|
const systemSegments = [];
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
systemSegments.push(metadataSystemPrompt.trim());
|
|
1153
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
1154
|
+
systemSegments.push(request.systemPrompt.trim());
|
|
995
1155
|
} else {
|
|
996
1156
|
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
997
1157
|
}
|
|
@@ -1242,9 +1402,9 @@ var GeminiProvider = class {
|
|
|
1242
1402
|
|
|
1243
1403
|
// src/evaluation/providers/cli.ts
|
|
1244
1404
|
var import_node_child_process = require("child_process");
|
|
1245
|
-
var
|
|
1405
|
+
var import_promises7 = __toESM(require("fs/promises"), 1);
|
|
1246
1406
|
var import_node_os = __toESM(require("os"), 1);
|
|
1247
|
-
var
|
|
1407
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1248
1408
|
var import_node_util = require("util");
|
|
1249
1409
|
var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
|
|
1250
1410
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
@@ -1341,7 +1501,7 @@ var CliProvider = class {
|
|
|
1341
1501
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1342
1502
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1343
1503
|
} finally {
|
|
1344
|
-
await
|
|
1504
|
+
await import_promises7.default.unlink(filePath).catch(() => {
|
|
1345
1505
|
});
|
|
1346
1506
|
}
|
|
1347
1507
|
}
|
|
@@ -1423,7 +1583,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
1423
1583
|
}
|
|
1424
1584
|
const unique = /* @__PURE__ */ new Map();
|
|
1425
1585
|
for (const inputFile of inputFiles) {
|
|
1426
|
-
const absolutePath =
|
|
1586
|
+
const absolutePath = import_node_path8.default.resolve(inputFile);
|
|
1427
1587
|
if (!unique.has(absolutePath)) {
|
|
1428
1588
|
unique.set(absolutePath, absolutePath);
|
|
1429
1589
|
}
|
|
@@ -1437,7 +1597,7 @@ function formatFileList(files, template) {
|
|
|
1437
1597
|
const formatter = template ?? "{path}";
|
|
1438
1598
|
return files.map((filePath) => {
|
|
1439
1599
|
const escapedPath = shellEscape(filePath);
|
|
1440
|
-
const escapedName = shellEscape(
|
|
1600
|
+
const escapedName = shellEscape(import_node_path8.default.basename(filePath));
|
|
1441
1601
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
1442
1602
|
}).join(" ");
|
|
1443
1603
|
}
|
|
@@ -1461,7 +1621,7 @@ function generateOutputFilePath(evalCaseId) {
|
|
|
1461
1621
|
const safeEvalId = evalCaseId || "unknown";
|
|
1462
1622
|
const timestamp = Date.now();
|
|
1463
1623
|
const random = Math.random().toString(36).substring(2, 9);
|
|
1464
|
-
return
|
|
1624
|
+
return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
1465
1625
|
}
|
|
1466
1626
|
function formatTimeoutSuffix(timeoutMs) {
|
|
1467
1627
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -1475,9 +1635,9 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1475
1635
|
var import_node_child_process2 = require("child_process");
|
|
1476
1636
|
var import_node_crypto = require("crypto");
|
|
1477
1637
|
var import_node_fs3 = require("fs");
|
|
1478
|
-
var
|
|
1638
|
+
var import_promises8 = require("fs/promises");
|
|
1479
1639
|
var import_node_os2 = require("os");
|
|
1480
|
-
var
|
|
1640
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
1481
1641
|
var import_node_util2 = require("util");
|
|
1482
1642
|
|
|
1483
1643
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -1534,7 +1694,7 @@ function subscribeToCodexLogEntries(listener) {
|
|
|
1534
1694
|
}
|
|
1535
1695
|
|
|
1536
1696
|
// src/evaluation/providers/preread.ts
|
|
1537
|
-
var
|
|
1697
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
1538
1698
|
function buildPromptDocument(request, inputFiles, options) {
|
|
1539
1699
|
const parts = [];
|
|
1540
1700
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -1559,7 +1719,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
1559
1719
|
}
|
|
1560
1720
|
const deduped = /* @__PURE__ */ new Map();
|
|
1561
1721
|
for (const inputFile of inputFiles) {
|
|
1562
|
-
const absolutePath =
|
|
1722
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1563
1723
|
if (!deduped.has(absolutePath)) {
|
|
1564
1724
|
deduped.set(absolutePath, absolutePath);
|
|
1565
1725
|
}
|
|
@@ -1572,14 +1732,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
1572
1732
|
}
|
|
1573
1733
|
const unique = /* @__PURE__ */ new Map();
|
|
1574
1734
|
for (const inputFile of inputFiles) {
|
|
1575
|
-
const absolutePath =
|
|
1735
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1576
1736
|
if (overrides?.has(absolutePath)) {
|
|
1577
1737
|
if (!unique.has(absolutePath)) {
|
|
1578
1738
|
unique.set(absolutePath, absolutePath);
|
|
1579
1739
|
}
|
|
1580
1740
|
continue;
|
|
1581
1741
|
}
|
|
1582
|
-
const normalized = absolutePath.split(
|
|
1742
|
+
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
1583
1743
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1584
1744
|
if (!unique.has(absolutePath)) {
|
|
1585
1745
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1594,7 +1754,7 @@ function collectInputFiles(inputFiles) {
|
|
|
1594
1754
|
}
|
|
1595
1755
|
const unique = /* @__PURE__ */ new Map();
|
|
1596
1756
|
for (const inputFile of inputFiles) {
|
|
1597
|
-
const absolutePath =
|
|
1757
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1598
1758
|
if (!unique.has(absolutePath)) {
|
|
1599
1759
|
unique.set(absolutePath, absolutePath);
|
|
1600
1760
|
}
|
|
@@ -1606,7 +1766,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
1606
1766
|
return "";
|
|
1607
1767
|
}
|
|
1608
1768
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1609
|
-
const fileName =
|
|
1769
|
+
const fileName = import_node_path9.default.basename(absolutePath);
|
|
1610
1770
|
const fileUri = pathToFileUri(absolutePath);
|
|
1611
1771
|
return `* [${fileName}](${fileUri})`;
|
|
1612
1772
|
});
|
|
@@ -1626,7 +1786,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
1626
1786
|
return sections.join("\n");
|
|
1627
1787
|
}
|
|
1628
1788
|
function pathToFileUri(filePath) {
|
|
1629
|
-
const absolutePath =
|
|
1789
|
+
const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
|
|
1630
1790
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1631
1791
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1632
1792
|
return `file:///${normalizedPath}`;
|
|
@@ -1664,8 +1824,8 @@ var CodexProvider = class {
|
|
|
1664
1824
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1665
1825
|
try {
|
|
1666
1826
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1667
|
-
const promptFile =
|
|
1668
|
-
await (0,
|
|
1827
|
+
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
1828
|
+
await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
|
|
1669
1829
|
const args = this.buildCodexArgs();
|
|
1670
1830
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
1671
1831
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -1714,7 +1874,7 @@ var CodexProvider = class {
|
|
|
1714
1874
|
if (!this.config.cwd) {
|
|
1715
1875
|
return workspaceRoot;
|
|
1716
1876
|
}
|
|
1717
|
-
return
|
|
1877
|
+
return import_node_path10.default.resolve(this.config.cwd);
|
|
1718
1878
|
}
|
|
1719
1879
|
buildCodexArgs() {
|
|
1720
1880
|
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
@@ -1748,11 +1908,11 @@ var CodexProvider = class {
|
|
|
1748
1908
|
}
|
|
1749
1909
|
}
|
|
1750
1910
|
async createWorkspace() {
|
|
1751
|
-
return await (0,
|
|
1911
|
+
return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
|
|
1752
1912
|
}
|
|
1753
1913
|
async cleanupWorkspace(workspaceRoot) {
|
|
1754
1914
|
try {
|
|
1755
|
-
await (0,
|
|
1915
|
+
await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
|
|
1756
1916
|
} catch {
|
|
1757
1917
|
}
|
|
1758
1918
|
}
|
|
@@ -1762,9 +1922,9 @@ var CodexProvider = class {
|
|
|
1762
1922
|
return void 0;
|
|
1763
1923
|
}
|
|
1764
1924
|
if (this.config.logDir) {
|
|
1765
|
-
return
|
|
1925
|
+
return import_node_path10.default.resolve(this.config.logDir);
|
|
1766
1926
|
}
|
|
1767
|
-
return
|
|
1927
|
+
return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1768
1928
|
}
|
|
1769
1929
|
async createStreamLogger(request) {
|
|
1770
1930
|
const logDir = this.resolveLogDirectory();
|
|
@@ -1772,13 +1932,13 @@ var CodexProvider = class {
|
|
|
1772
1932
|
return void 0;
|
|
1773
1933
|
}
|
|
1774
1934
|
try {
|
|
1775
|
-
await (0,
|
|
1935
|
+
await (0, import_promises8.mkdir)(logDir, { recursive: true });
|
|
1776
1936
|
} catch (error) {
|
|
1777
1937
|
const message = error instanceof Error ? error.message : String(error);
|
|
1778
1938
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1779
1939
|
return void 0;
|
|
1780
1940
|
}
|
|
1781
|
-
const filePath =
|
|
1941
|
+
const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
1782
1942
|
try {
|
|
1783
1943
|
const logger = await CodexStreamLogger.create({
|
|
1784
1944
|
filePath,
|
|
@@ -1993,9 +2153,9 @@ function tryParseJsonValue(rawLine) {
|
|
|
1993
2153
|
async function locateExecutable(candidate) {
|
|
1994
2154
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
1995
2155
|
if (includesPathSeparator) {
|
|
1996
|
-
const resolved =
|
|
2156
|
+
const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
|
|
1997
2157
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
1998
|
-
await (0,
|
|
2158
|
+
await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
1999
2159
|
return executablePath;
|
|
2000
2160
|
}
|
|
2001
2161
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -2005,7 +2165,7 @@ async function locateExecutable(candidate) {
|
|
|
2005
2165
|
const preferred = selectExecutableCandidate(lines);
|
|
2006
2166
|
if (preferred) {
|
|
2007
2167
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
2008
|
-
await (0,
|
|
2168
|
+
await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2009
2169
|
return executablePath;
|
|
2010
2170
|
}
|
|
2011
2171
|
} catch {
|
|
@@ -2039,7 +2199,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
2039
2199
|
for (const ext of extensions) {
|
|
2040
2200
|
const withExtension = `${candidate}${ext}`;
|
|
2041
2201
|
try {
|
|
2042
|
-
await (0,
|
|
2202
|
+
await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
|
|
2043
2203
|
return withExtension;
|
|
2044
2204
|
} catch {
|
|
2045
2205
|
}
|
|
@@ -2851,7 +3011,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
2851
3011
|
}
|
|
2852
3012
|
|
|
2853
3013
|
// src/evaluation/providers/vscode.ts
|
|
2854
|
-
var
|
|
3014
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2855
3015
|
var import_subagent = require("subagent");
|
|
2856
3016
|
var VSCodeProvider = class {
|
|
2857
3017
|
id;
|
|
@@ -2964,6 +3124,9 @@ var VSCodeProvider = class {
|
|
|
2964
3124
|
};
|
|
2965
3125
|
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
2966
3126
|
const parts = [];
|
|
3127
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
3128
|
+
parts.push(request.systemPrompt.trim());
|
|
3129
|
+
}
|
|
2967
3130
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2968
3131
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2969
3132
|
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
@@ -2981,7 +3144,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
2981
3144
|
return "";
|
|
2982
3145
|
}
|
|
2983
3146
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2984
|
-
const fileName =
|
|
3147
|
+
const fileName = import_node_path11.default.basename(absolutePath);
|
|
2985
3148
|
const fileUri = pathToFileUri2(absolutePath);
|
|
2986
3149
|
return `* [${fileName}](${fileUri})`;
|
|
2987
3150
|
});
|
|
@@ -3006,8 +3169,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3006
3169
|
}
|
|
3007
3170
|
const unique = /* @__PURE__ */ new Map();
|
|
3008
3171
|
for (const attachment of attachments) {
|
|
3009
|
-
const absolutePath =
|
|
3010
|
-
const normalized = absolutePath.split(
|
|
3172
|
+
const absolutePath = import_node_path11.default.resolve(attachment);
|
|
3173
|
+
const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
|
|
3011
3174
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3012
3175
|
if (!unique.has(absolutePath)) {
|
|
3013
3176
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3022,7 +3185,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3022
3185
|
}
|
|
3023
3186
|
const unique = /* @__PURE__ */ new Map();
|
|
3024
3187
|
for (const attachment of attachments) {
|
|
3025
|
-
const absolutePath =
|
|
3188
|
+
const absolutePath = import_node_path11.default.resolve(attachment);
|
|
3026
3189
|
if (!unique.has(absolutePath)) {
|
|
3027
3190
|
unique.set(absolutePath, absolutePath);
|
|
3028
3191
|
}
|
|
@@ -3030,7 +3193,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3030
3193
|
return Array.from(unique.values());
|
|
3031
3194
|
}
|
|
3032
3195
|
function pathToFileUri2(filePath) {
|
|
3033
|
-
const absolutePath =
|
|
3196
|
+
const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
|
|
3034
3197
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3035
3198
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3036
3199
|
return `file:///${normalizedPath}`;
|
|
@@ -3043,7 +3206,7 @@ function normalizeAttachments(attachments) {
|
|
|
3043
3206
|
}
|
|
3044
3207
|
const deduped = /* @__PURE__ */ new Set();
|
|
3045
3208
|
for (const attachment of attachments) {
|
|
3046
|
-
deduped.add(
|
|
3209
|
+
deduped.add(import_node_path11.default.resolve(attachment));
|
|
3047
3210
|
}
|
|
3048
3211
|
return Array.from(deduped);
|
|
3049
3212
|
}
|
|
@@ -3052,7 +3215,7 @@ function mergeAttachments(all) {
|
|
|
3052
3215
|
for (const list of all) {
|
|
3053
3216
|
if (!list) continue;
|
|
3054
3217
|
for (const inputFile of list) {
|
|
3055
|
-
deduped.add(
|
|
3218
|
+
deduped.add(import_node_path11.default.resolve(inputFile));
|
|
3056
3219
|
}
|
|
3057
3220
|
}
|
|
3058
3221
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3098,9 +3261,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3098
3261
|
|
|
3099
3262
|
// src/evaluation/providers/targets-file.ts
|
|
3100
3263
|
var import_node_fs4 = require("fs");
|
|
3101
|
-
var
|
|
3102
|
-
var
|
|
3103
|
-
var
|
|
3264
|
+
var import_promises9 = require("fs/promises");
|
|
3265
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3266
|
+
var import_yaml3 = require("yaml");
|
|
3104
3267
|
|
|
3105
3268
|
// src/evaluation/providers/types.ts
|
|
3106
3269
|
var AGENT_PROVIDER_KINDS = [
|
|
@@ -3161,19 +3324,19 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
3161
3324
|
}
|
|
3162
3325
|
async function fileExists3(filePath) {
|
|
3163
3326
|
try {
|
|
3164
|
-
await (0,
|
|
3327
|
+
await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
|
|
3165
3328
|
return true;
|
|
3166
3329
|
} catch {
|
|
3167
3330
|
return false;
|
|
3168
3331
|
}
|
|
3169
3332
|
}
|
|
3170
3333
|
async function readTargetDefinitions(filePath) {
|
|
3171
|
-
const absolutePath =
|
|
3334
|
+
const absolutePath = import_node_path12.default.resolve(filePath);
|
|
3172
3335
|
if (!await fileExists3(absolutePath)) {
|
|
3173
3336
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3174
3337
|
}
|
|
3175
|
-
const raw = await (0,
|
|
3176
|
-
const parsed = (0,
|
|
3338
|
+
const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
|
|
3339
|
+
const parsed = (0, import_yaml3.parse)(raw);
|
|
3177
3340
|
if (!isRecord(parsed)) {
|
|
3178
3341
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
3179
3342
|
}
|
|
@@ -3216,18 +3379,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
3216
3379
|
}
|
|
3217
3380
|
|
|
3218
3381
|
// src/evaluation/evaluators.ts
|
|
3219
|
-
var
|
|
3382
|
+
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3383
|
+
|
|
3384
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3385
|
+
|
|
3386
|
+
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3387
|
+
|
|
3388
|
+
[[ ## expected_outcome ## ]]
|
|
3389
|
+
{{expected_outcome}}
|
|
3390
|
+
|
|
3391
|
+
[[ ## question ## ]]
|
|
3392
|
+
{{question}}
|
|
3393
|
+
|
|
3394
|
+
[[ ## reference_answer ## ]]
|
|
3395
|
+
{{reference_answer}}
|
|
3396
|
+
|
|
3397
|
+
[[ ## candidate_answer ## ]]
|
|
3398
|
+
{{candidate_answer}}`;
|
|
3220
3399
|
var LlmJudgeEvaluator = class {
|
|
3221
3400
|
kind = "llm_judge";
|
|
3222
3401
|
resolveJudgeProvider;
|
|
3223
3402
|
maxOutputTokens;
|
|
3224
3403
|
temperature;
|
|
3225
|
-
|
|
3404
|
+
evaluatorTemplate;
|
|
3226
3405
|
constructor(options) {
|
|
3227
3406
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
3228
3407
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
3229
3408
|
this.temperature = options.temperature;
|
|
3230
|
-
this.
|
|
3409
|
+
this.evaluatorTemplate = options.evaluatorTemplate;
|
|
3231
3410
|
}
|
|
3232
3411
|
async evaluate(context) {
|
|
3233
3412
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
@@ -3237,26 +3416,21 @@ var LlmJudgeEvaluator = class {
|
|
|
3237
3416
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
3238
3417
|
}
|
|
3239
3418
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
3240
|
-
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
3241
3419
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3242
|
-
|
|
3243
|
-
|
|
3244
|
-
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
prompt = substituteVariables(systemPrompt, variables);
|
|
3254
|
-
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
3255
|
-
}
|
|
3256
|
-
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
3420
|
+
const variables = {
|
|
3421
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3422
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
3423
|
+
candidate_answer: context.candidate.trim(),
|
|
3424
|
+
reference_answer: (context.evalCase.reference_answer ?? "").trim(),
|
|
3425
|
+
expected_outcome: context.evalCase.expected_outcome.trim(),
|
|
3426
|
+
question: formattedQuestion.trim()
|
|
3427
|
+
};
|
|
3428
|
+
const systemPrompt = buildOutputSchema();
|
|
3429
|
+
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
3430
|
+
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
3257
3431
|
const response = await judgeProvider.invoke({
|
|
3258
|
-
question:
|
|
3259
|
-
|
|
3432
|
+
question: userPrompt,
|
|
3433
|
+
systemPrompt,
|
|
3260
3434
|
evalCaseId: context.evalCase.id,
|
|
3261
3435
|
attempt: context.attempt,
|
|
3262
3436
|
maxOutputTokens: this.maxOutputTokens,
|
|
@@ -3269,11 +3443,9 @@ var LlmJudgeEvaluator = class {
|
|
|
3269
3443
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
3270
3444
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3271
3445
|
const evaluatorRawRequest = {
|
|
3272
|
-
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
target: context.target.name,
|
|
3276
|
-
...systemPrompt !== void 0 && { systemPrompt }
|
|
3446
|
+
userPrompt,
|
|
3447
|
+
systemPrompt,
|
|
3448
|
+
target: judgeProvider.targetName
|
|
3277
3449
|
};
|
|
3278
3450
|
return {
|
|
3279
3451
|
score,
|
|
@@ -3285,20 +3457,8 @@ var LlmJudgeEvaluator = class {
|
|
|
3285
3457
|
};
|
|
3286
3458
|
}
|
|
3287
3459
|
};
|
|
3288
|
-
function
|
|
3289
|
-
|
|
3290
|
-
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
3291
|
-
""
|
|
3292
|
-
];
|
|
3293
|
-
if (hasReferenceAnswer) {
|
|
3294
|
-
basePrompt.push(
|
|
3295
|
-
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
3296
|
-
""
|
|
3297
|
-
);
|
|
3298
|
-
}
|
|
3299
|
-
basePrompt.push(
|
|
3300
|
-
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
3301
|
-
"",
|
|
3460
|
+
function buildOutputSchema() {
|
|
3461
|
+
return [
|
|
3302
3462
|
"You must respond with a single JSON object matching this schema:",
|
|
3303
3463
|
"",
|
|
3304
3464
|
"{",
|
|
@@ -3307,30 +3467,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
|
|
|
3307
3467
|
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
3308
3468
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
3309
3469
|
"}"
|
|
3310
|
-
);
|
|
3311
|
-
return basePrompt.join("\n");
|
|
3312
|
-
}
|
|
3313
|
-
function buildQualityPrompt(evalCase, candidate, question) {
|
|
3314
|
-
const parts = [
|
|
3315
|
-
"[[ ## expected_outcome ## ]]",
|
|
3316
|
-
evalCase.expected_outcome.trim(),
|
|
3317
|
-
"",
|
|
3318
|
-
"[[ ## question ## ]]",
|
|
3319
|
-
question.trim(),
|
|
3320
|
-
""
|
|
3321
|
-
];
|
|
3322
|
-
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
3323
|
-
parts.push(
|
|
3324
|
-
"[[ ## reference_answer ## ]]",
|
|
3325
|
-
evalCase.reference_answer.trim(),
|
|
3326
|
-
""
|
|
3327
|
-
);
|
|
3328
|
-
}
|
|
3329
|
-
parts.push(
|
|
3330
|
-
"[[ ## candidate_answer ## ]]",
|
|
3331
|
-
candidate.trim()
|
|
3332
|
-
);
|
|
3333
|
-
return parts.join("\n");
|
|
3470
|
+
].join("\n");
|
|
3334
3471
|
}
|
|
3335
3472
|
function clampScore(value) {
|
|
3336
3473
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -3412,9 +3549,6 @@ function extractJsonBlob(text) {
|
|
|
3412
3549
|
function isNonEmptyString(value) {
|
|
3413
3550
|
return typeof value === "string" && value.trim().length > 0;
|
|
3414
3551
|
}
|
|
3415
|
-
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
3416
|
-
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
3417
|
-
}
|
|
3418
3552
|
var CodeEvaluator = class {
|
|
3419
3553
|
kind = "code";
|
|
3420
3554
|
script;
|
|
@@ -3520,19 +3654,16 @@ function parseJsonSafe(payload) {
|
|
|
3520
3654
|
return void 0;
|
|
3521
3655
|
}
|
|
3522
3656
|
}
|
|
3523
|
-
function hasTemplateVariables(text) {
|
|
3524
|
-
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
3525
|
-
}
|
|
3526
3657
|
function substituteVariables(template, variables) {
|
|
3527
|
-
return template.replace(
|
|
3658
|
+
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
3528
3659
|
return variables[varName] ?? match;
|
|
3529
3660
|
});
|
|
3530
3661
|
}
|
|
3531
3662
|
|
|
3532
3663
|
// src/evaluation/orchestrator.ts
|
|
3533
|
-
var
|
|
3534
|
-
var
|
|
3535
|
-
var
|
|
3664
|
+
var import_node_crypto2 = require("crypto");
|
|
3665
|
+
var import_promises10 = require("fs/promises");
|
|
3666
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3536
3667
|
|
|
3537
3668
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
3538
3669
|
var Node = class {
|
|
@@ -4095,6 +4226,7 @@ async function evaluateCandidate(options) {
|
|
|
4095
4226
|
}
|
|
4096
4227
|
}
|
|
4097
4228
|
return {
|
|
4229
|
+
timestamp: completedAt.toISOString(),
|
|
4098
4230
|
eval_id: evalCase.id,
|
|
4099
4231
|
dataset: evalCase.dataset,
|
|
4100
4232
|
conversation_id: evalCase.conversation_id,
|
|
@@ -4102,14 +4234,12 @@ async function evaluateCandidate(options) {
|
|
|
4102
4234
|
hits: score.hits,
|
|
4103
4235
|
misses: score.misses,
|
|
4104
4236
|
candidate_answer: candidate,
|
|
4105
|
-
expected_aspect_count: score.expectedAspectCount,
|
|
4106
4237
|
target: target.name,
|
|
4107
|
-
timestamp: completedAt.toISOString(),
|
|
4108
4238
|
reasoning: score.reasoning,
|
|
4109
4239
|
raw_aspects: score.rawAspects,
|
|
4110
4240
|
agent_provider_request: agentProviderRequest,
|
|
4111
4241
|
lm_provider_request: lmProviderRequest,
|
|
4112
|
-
|
|
4242
|
+
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4113
4243
|
evaluator_results: evaluatorResults
|
|
4114
4244
|
};
|
|
4115
4245
|
}
|
|
@@ -4186,7 +4316,7 @@ async function runEvaluatorList(options) {
|
|
|
4186
4316
|
hits: score2.hits,
|
|
4187
4317
|
misses: score2.misses,
|
|
4188
4318
|
reasoning: score2.reasoning,
|
|
4189
|
-
|
|
4319
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4190
4320
|
});
|
|
4191
4321
|
continue;
|
|
4192
4322
|
}
|
|
@@ -4213,7 +4343,7 @@ async function runEvaluatorList(options) {
|
|
|
4213
4343
|
hits: score2.hits,
|
|
4214
4344
|
misses: score2.misses,
|
|
4215
4345
|
reasoning: score2.reasoning,
|
|
4216
|
-
|
|
4346
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4217
4347
|
});
|
|
4218
4348
|
continue;
|
|
4219
4349
|
}
|
|
@@ -4266,7 +4396,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
4266
4396
|
promptInputs,
|
|
4267
4397
|
now,
|
|
4268
4398
|
judgeProvider,
|
|
4269
|
-
|
|
4399
|
+
evaluatorTemplateOverride: customPrompt,
|
|
4270
4400
|
evaluator: config
|
|
4271
4401
|
});
|
|
4272
4402
|
}
|
|
@@ -4307,22 +4437,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
4307
4437
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
4308
4438
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
4309
4439
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
4310
|
-
const filePath =
|
|
4311
|
-
await (0,
|
|
4440
|
+
const filePath = import_node_path13.default.resolve(directory, filename);
|
|
4441
|
+
await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
|
|
4312
4442
|
const payload = {
|
|
4313
4443
|
eval_id: evalCase.id,
|
|
4314
4444
|
question: promptInputs.question,
|
|
4315
4445
|
guidelines: promptInputs.guidelines,
|
|
4316
4446
|
guideline_paths: evalCase.guideline_paths
|
|
4317
4447
|
};
|
|
4318
|
-
await (0,
|
|
4448
|
+
await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
4319
4449
|
}
|
|
4320
4450
|
function sanitizeFilename(value) {
|
|
4321
4451
|
if (!value) {
|
|
4322
4452
|
return "prompt";
|
|
4323
4453
|
}
|
|
4324
4454
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
4325
|
-
return sanitized.length > 0 ? sanitized : (0,
|
|
4455
|
+
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
4326
4456
|
}
|
|
4327
4457
|
async function invokeProvider(provider, options) {
|
|
4328
4458
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -4378,6 +4508,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4378
4508
|
}
|
|
4379
4509
|
}
|
|
4380
4510
|
return {
|
|
4511
|
+
timestamp: timestamp.toISOString(),
|
|
4381
4512
|
eval_id: evalCase.id,
|
|
4382
4513
|
dataset: evalCase.dataset,
|
|
4383
4514
|
conversation_id: evalCase.conversation_id,
|
|
@@ -4385,9 +4516,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4385
4516
|
hits: [],
|
|
4386
4517
|
misses: [`Error: ${message}`],
|
|
4387
4518
|
candidate_answer: `Error occurred: ${message}`,
|
|
4388
|
-
expected_aspect_count: 0,
|
|
4389
4519
|
target: targetName,
|
|
4390
|
-
timestamp: timestamp.toISOString(),
|
|
4391
4520
|
raw_aspects: [],
|
|
4392
4521
|
agent_provider_request: agentProviderRequest,
|
|
4393
4522
|
lm_provider_request: lmProviderRequest,
|
|
@@ -4395,7 +4524,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4395
4524
|
};
|
|
4396
4525
|
}
|
|
4397
4526
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
4398
|
-
const hash = (0,
|
|
4527
|
+
const hash = (0, import_node_crypto2.createHash)("sha256");
|
|
4399
4528
|
hash.update(provider.id);
|
|
4400
4529
|
hash.update(target.name);
|
|
4401
4530
|
hash.update(evalCase.id);
|