@agentv/core 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-YQBJAT5I.js → chunk-U3GEJ3K7.js} +1 -1
- package/dist/{chunk-YQBJAT5I.js.map → chunk-U3GEJ3K7.js.map} +1 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +675 -562
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +29 -26
- package/dist/index.d.ts +29 -26
- package/dist/index.js +707 -592
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -33,15 +33,15 @@ __export(index_exports, {
|
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
34
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
35
35
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
36
|
-
buildDirectoryChain: () =>
|
|
36
|
+
buildDirectoryChain: () => buildDirectoryChain2,
|
|
37
37
|
buildPromptInputs: () => buildPromptInputs,
|
|
38
|
-
buildSearchRoots: () =>
|
|
38
|
+
buildSearchRoots: () => buildSearchRoots2,
|
|
39
39
|
consumeCodexLogEntries: () => consumeCodexLogEntries,
|
|
40
40
|
createAgentKernel: () => createAgentKernel,
|
|
41
41
|
createProvider: () => createProvider,
|
|
42
42
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
43
43
|
extractCodeBlocks: () => extractCodeBlocks,
|
|
44
|
-
fileExists: () =>
|
|
44
|
+
fileExists: () => fileExists2,
|
|
45
45
|
findGitRoot: () => findGitRoot,
|
|
46
46
|
getHitCount: () => getHitCount,
|
|
47
47
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
@@ -57,7 +57,7 @@ __export(index_exports, {
|
|
|
57
57
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
58
58
|
readTextFile: () => readTextFile,
|
|
59
59
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
60
|
-
resolveFileReference: () =>
|
|
60
|
+
resolveFileReference: () => resolveFileReference2,
|
|
61
61
|
resolveTargetDefinition: () => resolveTargetDefinition,
|
|
62
62
|
runEvalCase: () => runEvalCase,
|
|
63
63
|
runEvaluation: () => runEvaluation,
|
|
@@ -116,47 +116,112 @@ function getHitCount(result) {
|
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
// src/evaluation/yaml-parser.ts
|
|
119
|
+
var import_promises5 = require("fs/promises");
|
|
120
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
121
|
+
var import_yaml2 = require("yaml");
|
|
122
|
+
|
|
123
|
+
// src/evaluation/formatting/segment-formatter.ts
|
|
124
|
+
function extractCodeBlocks(segments) {
|
|
125
|
+
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
126
|
+
const codeBlocks = [];
|
|
127
|
+
for (const segment of segments) {
|
|
128
|
+
const typeValue = segment["type"];
|
|
129
|
+
if (typeof typeValue !== "string" || typeValue !== "text") {
|
|
130
|
+
continue;
|
|
131
|
+
}
|
|
132
|
+
const textValue = segment["value"];
|
|
133
|
+
if (typeof textValue !== "string") {
|
|
134
|
+
continue;
|
|
135
|
+
}
|
|
136
|
+
const matches = textValue.match(CODE_BLOCK_PATTERN);
|
|
137
|
+
if (matches) {
|
|
138
|
+
codeBlocks.push(...matches);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
return codeBlocks;
|
|
142
|
+
}
|
|
143
|
+
function formatFileContents(parts) {
|
|
144
|
+
const fileCount = parts.filter((p) => p.isFile).length;
|
|
145
|
+
if (fileCount > 0) {
|
|
146
|
+
return parts.map((part) => {
|
|
147
|
+
if (part.isFile && part.displayPath) {
|
|
148
|
+
return `<file path="${part.displayPath}">
|
|
149
|
+
${part.content}
|
|
150
|
+
</file>`;
|
|
151
|
+
}
|
|
152
|
+
return part.content;
|
|
153
|
+
}).join("\n\n");
|
|
154
|
+
}
|
|
155
|
+
return parts.map((p) => p.content).join(" ");
|
|
156
|
+
}
|
|
157
|
+
function formatSegment(segment) {
|
|
158
|
+
const type = asString(segment.type);
|
|
159
|
+
if (type === "text") {
|
|
160
|
+
return asString(segment.value);
|
|
161
|
+
}
|
|
162
|
+
if (type === "guideline_ref") {
|
|
163
|
+
const refPath = asString(segment.path);
|
|
164
|
+
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
165
|
+
}
|
|
166
|
+
if (type === "file") {
|
|
167
|
+
const text = asString(segment.text);
|
|
168
|
+
const filePath = asString(segment.path);
|
|
169
|
+
if (text && filePath) {
|
|
170
|
+
return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
return void 0;
|
|
174
|
+
}
|
|
175
|
+
function hasVisibleContent(segments) {
|
|
176
|
+
return segments.some((segment) => {
|
|
177
|
+
const type = asString(segment.type);
|
|
178
|
+
if (type === "text") {
|
|
179
|
+
const value = asString(segment.value);
|
|
180
|
+
return value !== void 0 && value.trim().length > 0;
|
|
181
|
+
}
|
|
182
|
+
if (type === "guideline_ref") {
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
if (type === "file") {
|
|
186
|
+
const text = asString(segment.text);
|
|
187
|
+
return text !== void 0 && text.trim().length > 0;
|
|
188
|
+
}
|
|
189
|
+
return false;
|
|
190
|
+
});
|
|
191
|
+
}
|
|
192
|
+
function asString(value) {
|
|
193
|
+
return typeof value === "string" ? value : void 0;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// src/evaluation/loaders/config-loader.ts
|
|
119
197
|
var import_micromatch = __toESM(require("micromatch"), 1);
|
|
120
|
-
var import_node_fs2 = require("fs");
|
|
121
198
|
var import_promises2 = require("fs/promises");
|
|
122
199
|
var import_node_path2 = __toESM(require("path"), 1);
|
|
123
|
-
var import_node_url = require("url");
|
|
124
200
|
var import_yaml = require("yaml");
|
|
125
201
|
|
|
126
|
-
// src/evaluation/file-
|
|
202
|
+
// src/evaluation/loaders/file-resolver.ts
|
|
127
203
|
var import_node_fs = require("fs");
|
|
128
204
|
var import_promises = require("fs/promises");
|
|
129
205
|
var import_node_path = __toESM(require("path"), 1);
|
|
130
|
-
async function fileExists(
|
|
206
|
+
async function fileExists(absolutePath) {
|
|
131
207
|
try {
|
|
132
|
-
await (0, import_promises.access)(
|
|
208
|
+
await (0, import_promises.access)(absolutePath, import_node_fs.constants.F_OK);
|
|
133
209
|
return true;
|
|
134
210
|
} catch {
|
|
135
211
|
return false;
|
|
136
212
|
}
|
|
137
213
|
}
|
|
138
|
-
function
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
async function findGitRoot(startPath) {
|
|
146
|
-
let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
|
|
147
|
-
const root = import_node_path.default.parse(currentDir).root;
|
|
148
|
-
while (currentDir !== root) {
|
|
149
|
-
const gitPath = import_node_path.default.join(currentDir, ".git");
|
|
150
|
-
if (await fileExists(gitPath)) {
|
|
151
|
-
return currentDir;
|
|
152
|
-
}
|
|
153
|
-
const parentDir = import_node_path.default.dirname(currentDir);
|
|
154
|
-
if (parentDir === currentDir) {
|
|
155
|
-
break;
|
|
214
|
+
function resolveToAbsolutePath(candidate) {
|
|
215
|
+
if (candidate instanceof URL) {
|
|
216
|
+
return new URL(candidate).pathname;
|
|
217
|
+
}
|
|
218
|
+
if (typeof candidate === "string") {
|
|
219
|
+
if (candidate.startsWith("file://")) {
|
|
220
|
+
return new URL(candidate).pathname;
|
|
156
221
|
}
|
|
157
|
-
|
|
222
|
+
return import_node_path.default.resolve(candidate);
|
|
158
223
|
}
|
|
159
|
-
|
|
224
|
+
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
160
225
|
}
|
|
161
226
|
function buildDirectoryChain(filePath, repoRoot) {
|
|
162
227
|
const directories = [];
|
|
@@ -234,44 +299,15 @@ async function resolveFileReference(rawValue, searchRoots) {
|
|
|
234
299
|
return { displayPath, attempted };
|
|
235
300
|
}
|
|
236
301
|
|
|
237
|
-
// src/evaluation/
|
|
238
|
-
var
|
|
302
|
+
// src/evaluation/loaders/config-loader.ts
|
|
303
|
+
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
239
304
|
var ANSI_YELLOW = "\x1B[33m";
|
|
240
305
|
var ANSI_RESET = "\x1B[0m";
|
|
241
|
-
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
242
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
243
|
-
async function readTestSuiteMetadata(testFilePath) {
|
|
244
|
-
try {
|
|
245
|
-
const absolutePath = import_node_path2.default.resolve(testFilePath);
|
|
246
|
-
const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
|
|
247
|
-
const parsed = (0, import_yaml.parse)(content);
|
|
248
|
-
if (!isJsonObject(parsed)) {
|
|
249
|
-
return {};
|
|
250
|
-
}
|
|
251
|
-
return { target: extractTargetFromSuite(parsed) };
|
|
252
|
-
} catch {
|
|
253
|
-
return {};
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
function extractTargetFromSuite(suite) {
|
|
257
|
-
const execution = suite.execution;
|
|
258
|
-
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
259
|
-
const executionTarget = execution.target;
|
|
260
|
-
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
261
|
-
return executionTarget.trim();
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
const targetValue = suite.target;
|
|
265
|
-
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
266
|
-
return targetValue.trim();
|
|
267
|
-
}
|
|
268
|
-
return void 0;
|
|
269
|
-
}
|
|
270
306
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
271
307
|
const directories = buildDirectoryChain(evalFilePath, repoRoot);
|
|
272
308
|
for (const directory of directories) {
|
|
273
309
|
const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
|
|
274
|
-
if (!await
|
|
310
|
+
if (!await fileExists(configPath)) {
|
|
275
311
|
continue;
|
|
276
312
|
}
|
|
277
313
|
try {
|
|
@@ -313,24 +349,134 @@ function isGuidelineFile(filePath, patterns) {
|
|
|
313
349
|
const patternsToUse = patterns ?? [];
|
|
314
350
|
return import_micromatch.default.isMatch(normalized, patternsToUse);
|
|
315
351
|
}
|
|
316
|
-
function
|
|
317
|
-
const
|
|
318
|
-
|
|
319
|
-
const
|
|
320
|
-
if (typeof
|
|
352
|
+
function extractTargetFromSuite(suite) {
|
|
353
|
+
const execution = suite.execution;
|
|
354
|
+
if (execution && typeof execution === "object" && !Array.isArray(execution)) {
|
|
355
|
+
const executionTarget = execution.target;
|
|
356
|
+
if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
|
|
357
|
+
return executionTarget.trim();
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
const targetValue = suite.target;
|
|
361
|
+
if (typeof targetValue === "string" && targetValue.trim().length > 0) {
|
|
362
|
+
return targetValue.trim();
|
|
363
|
+
}
|
|
364
|
+
return void 0;
|
|
365
|
+
}
|
|
366
|
+
function logWarning(message) {
|
|
367
|
+
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
// src/evaluation/loaders/evaluator-parser.ts
|
|
371
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
372
|
+
var ANSI_YELLOW2 = "\x1B[33m";
|
|
373
|
+
var ANSI_RESET2 = "\x1B[0m";
|
|
374
|
+
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
375
|
+
const execution = rawEvalCase.execution;
|
|
376
|
+
const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
377
|
+
if (candidateEvaluators === void 0) {
|
|
378
|
+
return void 0;
|
|
379
|
+
}
|
|
380
|
+
if (!Array.isArray(candidateEvaluators)) {
|
|
381
|
+
logWarning2(`Skipping evaluators for '${evalId}': expected array`);
|
|
382
|
+
return void 0;
|
|
383
|
+
}
|
|
384
|
+
const evaluators = [];
|
|
385
|
+
for (const rawEvaluator of candidateEvaluators) {
|
|
386
|
+
if (!isJsonObject2(rawEvaluator)) {
|
|
387
|
+
logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
|
|
321
388
|
continue;
|
|
322
389
|
}
|
|
323
|
-
const
|
|
324
|
-
|
|
390
|
+
const name = asString2(rawEvaluator.name);
|
|
391
|
+
const typeValue = rawEvaluator.type;
|
|
392
|
+
if (!name || !isEvaluatorKind(typeValue)) {
|
|
393
|
+
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
325
394
|
continue;
|
|
326
395
|
}
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
396
|
+
if (typeValue === "code") {
|
|
397
|
+
const script = asString2(rawEvaluator.script);
|
|
398
|
+
if (!script) {
|
|
399
|
+
logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
400
|
+
continue;
|
|
401
|
+
}
|
|
402
|
+
const cwd = asString2(rawEvaluator.cwd);
|
|
403
|
+
let resolvedCwd;
|
|
404
|
+
if (cwd) {
|
|
405
|
+
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
406
|
+
if (resolved.resolvedPath) {
|
|
407
|
+
resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
408
|
+
} else {
|
|
409
|
+
logWarning2(
|
|
410
|
+
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
411
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
412
|
+
);
|
|
413
|
+
}
|
|
414
|
+
} else {
|
|
415
|
+
resolvedCwd = searchRoots[0];
|
|
416
|
+
}
|
|
417
|
+
evaluators.push({
|
|
418
|
+
name,
|
|
419
|
+
type: "code",
|
|
420
|
+
script,
|
|
421
|
+
cwd,
|
|
422
|
+
resolvedCwd
|
|
423
|
+
});
|
|
424
|
+
continue;
|
|
425
|
+
}
|
|
426
|
+
const prompt = asString2(rawEvaluator.prompt);
|
|
427
|
+
let promptPath;
|
|
428
|
+
if (prompt) {
|
|
429
|
+
const resolved = await resolveFileReference(prompt, searchRoots);
|
|
430
|
+
if (resolved.resolvedPath) {
|
|
431
|
+
promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
432
|
+
} else {
|
|
433
|
+
logWarning2(
|
|
434
|
+
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
435
|
+
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
436
|
+
);
|
|
437
|
+
}
|
|
330
438
|
}
|
|
439
|
+
const _model = asString2(rawEvaluator.model);
|
|
440
|
+
evaluators.push({
|
|
441
|
+
name,
|
|
442
|
+
type: "llm_judge",
|
|
443
|
+
prompt,
|
|
444
|
+
promptPath
|
|
445
|
+
});
|
|
446
|
+
}
|
|
447
|
+
return evaluators.length > 0 ? evaluators : void 0;
|
|
448
|
+
}
|
|
449
|
+
function coerceEvaluator(candidate, contextId) {
|
|
450
|
+
if (typeof candidate !== "string") {
|
|
451
|
+
return void 0;
|
|
452
|
+
}
|
|
453
|
+
if (isEvaluatorKind(candidate)) {
|
|
454
|
+
return candidate;
|
|
455
|
+
}
|
|
456
|
+
logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
457
|
+
return void 0;
|
|
458
|
+
}
|
|
459
|
+
function asString2(value) {
|
|
460
|
+
return typeof value === "string" ? value : void 0;
|
|
461
|
+
}
|
|
462
|
+
function isJsonObject2(value) {
|
|
463
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
464
|
+
}
|
|
465
|
+
function logWarning2(message, details) {
|
|
466
|
+
if (details && details.length > 0) {
|
|
467
|
+
const detailBlock = details.join("\n");
|
|
468
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}
|
|
469
|
+
${detailBlock}${ANSI_RESET2}`);
|
|
470
|
+
} else {
|
|
471
|
+
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
|
|
331
472
|
}
|
|
332
|
-
return codeBlocks;
|
|
333
473
|
}
|
|
474
|
+
|
|
475
|
+
// src/evaluation/loaders/message-processor.ts
|
|
476
|
+
var import_promises3 = require("fs/promises");
|
|
477
|
+
var import_node_path4 = __toESM(require("path"), 1);
|
|
478
|
+
var ANSI_YELLOW3 = "\x1B[33m";
|
|
479
|
+
var ANSI_RESET3 = "\x1B[0m";
|
|
334
480
|
async function processMessages(options) {
|
|
335
481
|
const {
|
|
336
482
|
messages,
|
|
@@ -356,9 +502,9 @@ async function processMessages(options) {
|
|
|
356
502
|
if (!isJsonObject(rawSegment)) {
|
|
357
503
|
continue;
|
|
358
504
|
}
|
|
359
|
-
const segmentType =
|
|
505
|
+
const segmentType = asString3(rawSegment.type);
|
|
360
506
|
if (segmentType === "file") {
|
|
361
|
-
const rawValue =
|
|
507
|
+
const rawValue = asString3(rawSegment.value);
|
|
362
508
|
if (!rawValue) {
|
|
363
509
|
continue;
|
|
364
510
|
}
|
|
@@ -369,15 +515,15 @@ async function processMessages(options) {
|
|
|
369
515
|
if (!resolvedPath) {
|
|
370
516
|
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
371
517
|
const context = messageType === "input" ? "" : " in expected_messages";
|
|
372
|
-
|
|
518
|
+
logWarning3(`File not found${context}: ${displayPath}`, attempts);
|
|
373
519
|
continue;
|
|
374
520
|
}
|
|
375
521
|
try {
|
|
376
|
-
const fileContent = (await (0,
|
|
522
|
+
const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
377
523
|
if (messageType === "input" && guidelinePatterns && guidelinePaths) {
|
|
378
|
-
const relativeToRepo =
|
|
524
|
+
const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
|
|
379
525
|
if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
|
|
380
|
-
guidelinePaths.push(
|
|
526
|
+
guidelinePaths.push(import_node_path4.default.resolve(resolvedPath));
|
|
381
527
|
if (verbose) {
|
|
382
528
|
console.log(` [Guideline] Found: ${displayPath}`);
|
|
383
529
|
console.log(` Resolved to: ${resolvedPath}`);
|
|
@@ -389,7 +535,7 @@ async function processMessages(options) {
|
|
|
389
535
|
type: "file",
|
|
390
536
|
path: displayPath,
|
|
391
537
|
text: fileContent,
|
|
392
|
-
resolvedPath:
|
|
538
|
+
resolvedPath: import_node_path4.default.resolve(resolvedPath)
|
|
393
539
|
});
|
|
394
540
|
if (verbose) {
|
|
395
541
|
const label = messageType === "input" ? "[File]" : "[Expected Output File]";
|
|
@@ -398,7 +544,7 @@ async function processMessages(options) {
|
|
|
398
544
|
}
|
|
399
545
|
} catch (error) {
|
|
400
546
|
const context = messageType === "input" ? "" : " expected output";
|
|
401
|
-
|
|
547
|
+
logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
|
|
402
548
|
}
|
|
403
549
|
continue;
|
|
404
550
|
}
|
|
@@ -412,201 +558,117 @@ async function processMessages(options) {
|
|
|
412
558
|
}
|
|
413
559
|
return segments;
|
|
414
560
|
}
|
|
415
|
-
async function
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
|
|
419
|
-
if (!await fileExists2(absoluteTestPath)) {
|
|
420
|
-
throw new Error(`Test file not found: ${evalFilePath}`);
|
|
421
|
-
}
|
|
422
|
-
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
423
|
-
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
424
|
-
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
425
|
-
const guidelinePatterns = config?.guideline_patterns;
|
|
426
|
-
const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
|
|
427
|
-
const parsed = (0, import_yaml.parse)(rawFile);
|
|
428
|
-
if (!isJsonObject(parsed)) {
|
|
429
|
-
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
430
|
-
}
|
|
431
|
-
const suite = parsed;
|
|
432
|
-
const datasetNameFromSuite = asString(suite.dataset)?.trim();
|
|
433
|
-
const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
434
|
-
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
435
|
-
const schema = suite.$schema;
|
|
436
|
-
if (schema !== SCHEMA_EVAL_V2) {
|
|
437
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
438
|
-
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
439
|
-
throw new Error(message);
|
|
561
|
+
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
562
|
+
if (typeof content === "string") {
|
|
563
|
+
return content;
|
|
440
564
|
}
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
565
|
+
if (!content) {
|
|
566
|
+
return "";
|
|
444
567
|
}
|
|
445
|
-
const
|
|
446
|
-
const
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
for (const rawEvalcase of rawTestcases) {
|
|
450
|
-
if (!isJsonObject(rawEvalcase)) {
|
|
451
|
-
logWarning("Skipping invalid eval case entry (expected object)");
|
|
568
|
+
const parts = [];
|
|
569
|
+
for (const entry of content) {
|
|
570
|
+
if (typeof entry === "string") {
|
|
571
|
+
parts.push({ content: entry, isFile: false });
|
|
452
572
|
continue;
|
|
453
573
|
}
|
|
454
|
-
|
|
455
|
-
const id = asString(evalcase.id);
|
|
456
|
-
if (evalIdFilter && id !== evalIdFilter) {
|
|
574
|
+
if (!isJsonObject(entry)) {
|
|
457
575
|
continue;
|
|
458
576
|
}
|
|
459
|
-
const
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
577
|
+
const segmentType = asString3(entry.type);
|
|
578
|
+
if (segmentType === "file") {
|
|
579
|
+
const rawValue = asString3(entry.value);
|
|
580
|
+
if (!rawValue) {
|
|
581
|
+
continue;
|
|
582
|
+
}
|
|
583
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
584
|
+
rawValue,
|
|
585
|
+
searchRoots
|
|
586
|
+
);
|
|
587
|
+
if (!resolvedPath) {
|
|
588
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
589
|
+
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
590
|
+
continue;
|
|
591
|
+
}
|
|
592
|
+
try {
|
|
593
|
+
const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
594
|
+
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
595
|
+
if (verbose) {
|
|
596
|
+
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
597
|
+
console.log(` Resolved to: ${resolvedPath}`);
|
|
598
|
+
}
|
|
599
|
+
} catch (error) {
|
|
600
|
+
logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
601
|
+
}
|
|
465
602
|
continue;
|
|
466
603
|
}
|
|
467
|
-
const
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
471
|
-
logWarning(`No valid expected message found for eval case: ${id}`);
|
|
604
|
+
const textValue = asString3(entry.text);
|
|
605
|
+
if (typeof textValue === "string") {
|
|
606
|
+
parts.push({ content: textValue, isFile: false });
|
|
472
607
|
continue;
|
|
473
608
|
}
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
const inputTextParts = [];
|
|
479
|
-
const inputSegments = await processMessages({
|
|
480
|
-
messages: inputMessages,
|
|
481
|
-
searchRoots,
|
|
482
|
-
repoRootPath,
|
|
483
|
-
guidelinePatterns,
|
|
484
|
-
guidelinePaths,
|
|
485
|
-
textParts: inputTextParts,
|
|
486
|
-
messageType: "input",
|
|
487
|
-
verbose
|
|
488
|
-
});
|
|
489
|
-
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
490
|
-
messages: expectedMessages,
|
|
491
|
-
searchRoots,
|
|
492
|
-
repoRootPath,
|
|
493
|
-
guidelinePatterns,
|
|
494
|
-
messageType: "output",
|
|
495
|
-
verbose
|
|
496
|
-
}) : [];
|
|
497
|
-
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
498
|
-
const expectedContent = expectedMessages[0]?.content;
|
|
499
|
-
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
500
|
-
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
501
|
-
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
502
|
-
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
503
|
-
const userFilePaths = [];
|
|
504
|
-
for (const segment of inputSegments) {
|
|
505
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
506
|
-
userFilePaths.push(segment.resolvedPath);
|
|
507
|
-
}
|
|
508
|
-
}
|
|
509
|
-
const allFilePaths = [
|
|
510
|
-
...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
511
|
-
...userFilePaths
|
|
512
|
-
];
|
|
513
|
-
const testCase = {
|
|
514
|
-
id,
|
|
515
|
-
dataset: datasetName,
|
|
516
|
-
conversation_id: conversationId,
|
|
517
|
-
question,
|
|
518
|
-
input_messages: inputMessages,
|
|
519
|
-
input_segments: inputSegments,
|
|
520
|
-
output_segments: outputSegments,
|
|
521
|
-
reference_answer: referenceAnswer,
|
|
522
|
-
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
|
|
523
|
-
guideline_patterns: guidelinePatterns,
|
|
524
|
-
file_paths: allFilePaths,
|
|
525
|
-
code_snippets: codeSnippets,
|
|
526
|
-
expected_outcome: outcome,
|
|
527
|
-
evaluator: evalCaseEvaluatorKind,
|
|
528
|
-
evaluators
|
|
529
|
-
};
|
|
530
|
-
if (verbose) {
|
|
531
|
-
console.log(`
|
|
532
|
-
[Eval Case: ${id}]`);
|
|
533
|
-
if (testCase.guideline_paths.length > 0) {
|
|
534
|
-
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
535
|
-
for (const guidelinePath of testCase.guideline_paths) {
|
|
536
|
-
console.log(` - ${guidelinePath}`);
|
|
537
|
-
}
|
|
538
|
-
} else {
|
|
539
|
-
console.log(" No guidelines found");
|
|
540
|
-
}
|
|
609
|
+
const valueValue = asString3(entry.value);
|
|
610
|
+
if (typeof valueValue === "string") {
|
|
611
|
+
parts.push({ content: valueValue, isFile: false });
|
|
612
|
+
continue;
|
|
541
613
|
}
|
|
542
|
-
|
|
614
|
+
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
543
615
|
}
|
|
544
|
-
return
|
|
616
|
+
return formatFileContents(parts);
|
|
545
617
|
}
|
|
546
|
-
function
|
|
547
|
-
|
|
548
|
-
return true;
|
|
549
|
-
}
|
|
550
|
-
let messagesWithContent = 0;
|
|
551
|
-
for (const segments of processedSegmentsByMessage) {
|
|
552
|
-
if (hasVisibleContent(segments)) {
|
|
553
|
-
messagesWithContent++;
|
|
554
|
-
}
|
|
555
|
-
}
|
|
556
|
-
return messagesWithContent > 1;
|
|
618
|
+
function asString3(value) {
|
|
619
|
+
return typeof value === "string" ? value : void 0;
|
|
557
620
|
}
|
|
558
|
-
function
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
if (type === "text") {
|
|
562
|
-
const value = asString(segment.value);
|
|
563
|
-
return value !== void 0 && value.trim().length > 0;
|
|
564
|
-
}
|
|
565
|
-
if (type === "guideline_ref") {
|
|
566
|
-
return false;
|
|
567
|
-
}
|
|
568
|
-
if (type === "file") {
|
|
569
|
-
const text = asString(segment.text);
|
|
570
|
-
return text !== void 0 && text.trim().length > 0;
|
|
571
|
-
}
|
|
572
|
-
return false;
|
|
573
|
-
});
|
|
621
|
+
function cloneJsonObject(source) {
|
|
622
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
623
|
+
return Object.fromEntries(entries);
|
|
574
624
|
}
|
|
575
|
-
function
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
return asString(segment.value);
|
|
625
|
+
function cloneJsonValue(value) {
|
|
626
|
+
if (value === null) {
|
|
627
|
+
return null;
|
|
579
628
|
}
|
|
580
|
-
if (
|
|
581
|
-
|
|
582
|
-
return refPath ? `<Attached: ${refPath}>` : void 0;
|
|
629
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
630
|
+
return value;
|
|
583
631
|
}
|
|
584
|
-
if (
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
632
|
+
if (Array.isArray(value)) {
|
|
633
|
+
return value.map((item) => cloneJsonValue(item));
|
|
634
|
+
}
|
|
635
|
+
if (typeof value === "object") {
|
|
636
|
+
return cloneJsonObject(value);
|
|
637
|
+
}
|
|
638
|
+
return value;
|
|
639
|
+
}
|
|
640
|
+
function logWarning3(message, details) {
|
|
641
|
+
if (details && details.length > 0) {
|
|
642
|
+
const detailBlock = details.join("\n");
|
|
643
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}
|
|
644
|
+
${detailBlock}${ANSI_RESET3}`);
|
|
645
|
+
} else {
|
|
646
|
+
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
|
|
590
647
|
}
|
|
591
|
-
return void 0;
|
|
592
648
|
}
|
|
649
|
+
|
|
650
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
651
|
+
var import_promises4 = require("fs/promises");
|
|
652
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
653
|
+
var ANSI_YELLOW4 = "\x1B[33m";
|
|
654
|
+
var ANSI_RESET4 = "\x1B[0m";
|
|
593
655
|
async function buildPromptInputs(testCase) {
|
|
594
656
|
const guidelineParts = [];
|
|
595
657
|
for (const rawPath of testCase.guideline_paths) {
|
|
596
|
-
const absolutePath =
|
|
597
|
-
if (!await
|
|
598
|
-
|
|
658
|
+
const absolutePath = import_node_path5.default.resolve(rawPath);
|
|
659
|
+
if (!await fileExists(absolutePath)) {
|
|
660
|
+
logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
599
661
|
continue;
|
|
600
662
|
}
|
|
601
663
|
try {
|
|
602
|
-
const content = (await (0,
|
|
664
|
+
const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
603
665
|
guidelineParts.push({
|
|
604
666
|
content,
|
|
605
667
|
isFile: true,
|
|
606
|
-
displayPath:
|
|
668
|
+
displayPath: import_node_path5.default.basename(absolutePath)
|
|
607
669
|
});
|
|
608
670
|
} catch (error) {
|
|
609
|
-
|
|
671
|
+
logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
610
672
|
}
|
|
611
673
|
}
|
|
612
674
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -630,9 +692,9 @@ async function buildPromptInputs(testCase) {
|
|
|
630
692
|
messageSegments.push({ type: "text", value: segment });
|
|
631
693
|
}
|
|
632
694
|
} else if (isJsonObject(segment)) {
|
|
633
|
-
const type =
|
|
695
|
+
const type = asString4(segment.type);
|
|
634
696
|
if (type === "file") {
|
|
635
|
-
const value =
|
|
697
|
+
const value = asString4(segment.value);
|
|
636
698
|
if (!value) continue;
|
|
637
699
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
638
700
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -643,7 +705,7 @@ async function buildPromptInputs(testCase) {
|
|
|
643
705
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
644
706
|
}
|
|
645
707
|
} else if (type === "text") {
|
|
646
|
-
const textValue =
|
|
708
|
+
const textValue = asString4(segment.value);
|
|
647
709
|
if (textValue && textValue.trim().length > 0) {
|
|
648
710
|
messageSegments.push({ type: "text", value: textValue });
|
|
649
711
|
}
|
|
@@ -699,6 +761,18 @@ ${messageContent}`);
|
|
|
699
761
|
}) : void 0;
|
|
700
762
|
return { question, guidelines, chatPrompt };
|
|
701
763
|
}
|
|
764
|
+
function needsRoleMarkers(messages, processedSegmentsByMessage) {
|
|
765
|
+
if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
|
|
766
|
+
return true;
|
|
767
|
+
}
|
|
768
|
+
let messagesWithContent = 0;
|
|
769
|
+
for (const segments of processedSegmentsByMessage) {
|
|
770
|
+
if (hasVisibleContent(segments)) {
|
|
771
|
+
messagesWithContent++;
|
|
772
|
+
}
|
|
773
|
+
}
|
|
774
|
+
return messagesWithContent > 1;
|
|
775
|
+
}
|
|
702
776
|
function buildChatPromptFromSegments(options) {
|
|
703
777
|
const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
|
|
704
778
|
if (messages.length === 0) {
|
|
@@ -770,211 +844,282 @@ ${guidelineContent.trim()}`);
|
|
|
770
844
|
...name ? { name } : {}
|
|
771
845
|
});
|
|
772
846
|
}
|
|
773
|
-
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
847
|
+
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
848
|
+
}
|
|
849
|
+
function asString4(value) {
|
|
850
|
+
return typeof value === "string" ? value : void 0;
|
|
851
|
+
}
|
|
852
|
+
function logWarning4(message) {
|
|
853
|
+
console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
// src/evaluation/yaml-parser.ts
|
|
857
|
+
var ANSI_YELLOW5 = "\x1B[33m";
|
|
858
|
+
var ANSI_RESET5 = "\x1B[0m";
|
|
859
|
+
var SCHEMA_EVAL_V2 = "agentv-eval-v2";
|
|
860
|
+
async function readTestSuiteMetadata(testFilePath) {
|
|
861
|
+
try {
|
|
862
|
+
const absolutePath = import_node_path6.default.resolve(testFilePath);
|
|
863
|
+
const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
|
|
864
|
+
const parsed = (0, import_yaml2.parse)(content);
|
|
865
|
+
if (!isJsonObject(parsed)) {
|
|
866
|
+
return {};
|
|
867
|
+
}
|
|
868
|
+
return { target: extractTargetFromSuite(parsed) };
|
|
869
|
+
} catch {
|
|
870
|
+
return {};
|
|
871
|
+
}
|
|
872
|
+
}
|
|
873
|
+
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
874
|
+
const verbose = options?.verbose ?? false;
|
|
875
|
+
const evalIdFilter = options?.evalId;
|
|
876
|
+
const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
|
|
877
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
878
|
+
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
879
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
880
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
881
|
+
const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
|
|
882
|
+
const parsed = (0, import_yaml2.parse)(rawFile);
|
|
883
|
+
if (!isJsonObject(parsed)) {
|
|
884
|
+
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
885
|
+
}
|
|
886
|
+
const suite = parsed;
|
|
887
|
+
const datasetNameFromSuite = asString5(suite.dataset)?.trim();
|
|
888
|
+
const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
889
|
+
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
890
|
+
const schema = suite.$schema;
|
|
891
|
+
if (schema !== SCHEMA_EVAL_V2) {
|
|
892
|
+
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
|
|
893
|
+
Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
|
|
894
|
+
throw new Error(message);
|
|
895
|
+
}
|
|
896
|
+
const rawTestcases = suite.evalcases;
|
|
897
|
+
if (!Array.isArray(rawTestcases)) {
|
|
898
|
+
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
|
|
899
|
+
}
|
|
900
|
+
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
901
|
+
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
902
|
+
const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
|
|
903
|
+
const results = [];
|
|
904
|
+
for (const rawEvalcase of rawTestcases) {
|
|
905
|
+
if (!isJsonObject(rawEvalcase)) {
|
|
906
|
+
logWarning5("Skipping invalid eval case entry (expected object)");
|
|
907
|
+
continue;
|
|
908
|
+
}
|
|
909
|
+
const evalcase = rawEvalcase;
|
|
910
|
+
const id = asString5(evalcase.id);
|
|
911
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
912
|
+
continue;
|
|
913
|
+
}
|
|
914
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
915
|
+
const outcome = asString5(evalcase.outcome);
|
|
916
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
917
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
918
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
919
|
+
logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
|
|
920
|
+
continue;
|
|
921
|
+
}
|
|
922
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
923
|
+
const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
|
|
924
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
925
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
926
|
+
logWarning5(`No valid expected message found for eval case: ${id}`);
|
|
927
|
+
continue;
|
|
928
|
+
}
|
|
929
|
+
if (expectedMessages.length > 1) {
|
|
930
|
+
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
931
|
+
}
|
|
932
|
+
const guidelinePaths = [];
|
|
933
|
+
const inputTextParts = [];
|
|
934
|
+
const inputSegments = await processMessages({
|
|
935
|
+
messages: inputMessages,
|
|
936
|
+
searchRoots,
|
|
937
|
+
repoRootPath,
|
|
938
|
+
guidelinePatterns,
|
|
939
|
+
guidelinePaths,
|
|
940
|
+
textParts: inputTextParts,
|
|
941
|
+
messageType: "input",
|
|
942
|
+
verbose
|
|
943
|
+
});
|
|
944
|
+
const outputSegments = hasExpectedMessages ? await processMessages({
|
|
945
|
+
messages: expectedMessages,
|
|
946
|
+
searchRoots,
|
|
947
|
+
repoRootPath,
|
|
948
|
+
guidelinePatterns,
|
|
949
|
+
messageType: "output",
|
|
950
|
+
verbose
|
|
951
|
+
}) : [];
|
|
952
|
+
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
953
|
+
const expectedContent = expectedMessages[0]?.content;
|
|
954
|
+
const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
|
|
955
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
956
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
957
|
+
const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
958
|
+
const userFilePaths = [];
|
|
959
|
+
for (const segment of inputSegments) {
|
|
960
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
961
|
+
userFilePaths.push(segment.resolvedPath);
|
|
962
|
+
}
|
|
963
|
+
}
|
|
964
|
+
const allFilePaths = [
|
|
965
|
+
...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
966
|
+
...userFilePaths
|
|
967
|
+
];
|
|
968
|
+
const testCase = {
|
|
969
|
+
id,
|
|
970
|
+
dataset: datasetName,
|
|
971
|
+
conversation_id: conversationId,
|
|
972
|
+
question,
|
|
973
|
+
input_messages: inputMessages,
|
|
974
|
+
input_segments: inputSegments,
|
|
975
|
+
output_segments: outputSegments,
|
|
976
|
+
reference_answer: referenceAnswer,
|
|
977
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
|
|
978
|
+
guideline_patterns: guidelinePatterns,
|
|
979
|
+
file_paths: allFilePaths,
|
|
980
|
+
code_snippets: codeSnippets,
|
|
981
|
+
expected_outcome: outcome,
|
|
982
|
+
evaluator: evalCaseEvaluatorKind,
|
|
983
|
+
evaluators
|
|
984
|
+
};
|
|
985
|
+
if (verbose) {
|
|
986
|
+
console.log(`
|
|
987
|
+
[Eval Case: ${id}]`);
|
|
988
|
+
if (testCase.guideline_paths.length > 0) {
|
|
989
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
990
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
991
|
+
console.log(` - ${guidelinePath}`);
|
|
992
|
+
}
|
|
993
|
+
} else {
|
|
994
|
+
console.log(" No guidelines found");
|
|
995
|
+
}
|
|
996
|
+
}
|
|
997
|
+
results.push(testCase);
|
|
998
|
+
}
|
|
999
|
+
return results;
|
|
1000
|
+
}
|
|
1001
|
+
function asString5(value) {
|
|
1002
|
+
return typeof value === "string" ? value : void 0;
|
|
1003
|
+
}
|
|
1004
|
+
function logWarning5(message, details) {
|
|
1005
|
+
if (details && details.length > 0) {
|
|
1006
|
+
const detailBlock = details.join("\n");
|
|
1007
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
1008
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1009
|
+
} else {
|
|
1010
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
1011
|
+
}
|
|
774
1012
|
}
|
|
775
|
-
|
|
1013
|
+
|
|
1014
|
+
// src/evaluation/file-utils.ts
|
|
1015
|
+
var import_node_fs2 = require("fs");
|
|
1016
|
+
var import_promises6 = require("fs/promises");
|
|
1017
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
1018
|
+
async function fileExists2(filePath) {
|
|
776
1019
|
try {
|
|
777
|
-
await (0,
|
|
1020
|
+
await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
|
|
778
1021
|
return true;
|
|
779
1022
|
} catch {
|
|
780
1023
|
return false;
|
|
781
1024
|
}
|
|
782
1025
|
}
|
|
783
|
-
function
|
|
784
|
-
|
|
785
|
-
return (0, import_node_url.fileURLToPath)(candidate);
|
|
786
|
-
}
|
|
787
|
-
if (typeof candidate === "string") {
|
|
788
|
-
if (candidate.startsWith("file://")) {
|
|
789
|
-
return (0, import_node_url.fileURLToPath)(new URL(candidate));
|
|
790
|
-
}
|
|
791
|
-
return import_node_path2.default.resolve(candidate);
|
|
792
|
-
}
|
|
793
|
-
throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
|
|
794
|
-
}
|
|
795
|
-
function asString(value) {
|
|
796
|
-
return typeof value === "string" ? value : void 0;
|
|
797
|
-
}
|
|
798
|
-
function cloneJsonObject(source) {
|
|
799
|
-
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
800
|
-
return Object.fromEntries(entries);
|
|
1026
|
+
function normalizeLineEndings(content) {
|
|
1027
|
+
return content.replace(/\r\n/g, "\n");
|
|
801
1028
|
}
|
|
802
|
-
function
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
}
|
|
806
|
-
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
807
|
-
return value;
|
|
808
|
-
}
|
|
809
|
-
if (Array.isArray(value)) {
|
|
810
|
-
return value.map((item) => cloneJsonValue(item));
|
|
811
|
-
}
|
|
812
|
-
return cloneJsonObject(value);
|
|
1029
|
+
async function readTextFile(filePath) {
|
|
1030
|
+
const content = await (0, import_promises6.readFile)(filePath, "utf8");
|
|
1031
|
+
return normalizeLineEndings(content);
|
|
813
1032
|
}
|
|
814
|
-
function
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
1033
|
+
async function findGitRoot(startPath) {
|
|
1034
|
+
let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
|
|
1035
|
+
const root = import_node_path7.default.parse(currentDir).root;
|
|
1036
|
+
while (currentDir !== root) {
|
|
1037
|
+
const gitPath = import_node_path7.default.join(currentDir, ".git");
|
|
1038
|
+
if (await fileExists2(gitPath)) {
|
|
1039
|
+
return currentDir;
|
|
1040
|
+
}
|
|
1041
|
+
const parentDir = import_node_path7.default.dirname(currentDir);
|
|
1042
|
+
if (parentDir === currentDir) {
|
|
1043
|
+
break;
|
|
1044
|
+
}
|
|
1045
|
+
currentDir = parentDir;
|
|
825
1046
|
}
|
|
826
|
-
return
|
|
1047
|
+
return null;
|
|
827
1048
|
}
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
if (typeof entry === "string") {
|
|
838
|
-
parts.push({ content: entry, isFile: false });
|
|
839
|
-
continue;
|
|
1049
|
+
function buildDirectoryChain2(filePath, repoRoot) {
|
|
1050
|
+
const directories = [];
|
|
1051
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1052
|
+
const boundary = import_node_path7.default.resolve(repoRoot);
|
|
1053
|
+
let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
|
|
1054
|
+
while (current !== void 0) {
|
|
1055
|
+
if (!seen.has(current)) {
|
|
1056
|
+
directories.push(current);
|
|
1057
|
+
seen.add(current);
|
|
840
1058
|
}
|
|
841
|
-
if (
|
|
842
|
-
|
|
1059
|
+
if (current === boundary) {
|
|
1060
|
+
break;
|
|
843
1061
|
}
|
|
844
|
-
const
|
|
845
|
-
if (
|
|
846
|
-
|
|
847
|
-
if (!rawValue) {
|
|
848
|
-
continue;
|
|
849
|
-
}
|
|
850
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference(
|
|
851
|
-
rawValue,
|
|
852
|
-
searchRoots
|
|
853
|
-
);
|
|
854
|
-
if (!resolvedPath) {
|
|
855
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
856
|
-
logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
857
|
-
continue;
|
|
858
|
-
}
|
|
859
|
-
try {
|
|
860
|
-
const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
861
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
862
|
-
if (verbose) {
|
|
863
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
864
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
865
|
-
}
|
|
866
|
-
} catch (error) {
|
|
867
|
-
logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
|
|
868
|
-
}
|
|
869
|
-
continue;
|
|
1062
|
+
const parent = import_node_path7.default.dirname(current);
|
|
1063
|
+
if (parent === current) {
|
|
1064
|
+
break;
|
|
870
1065
|
}
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
1066
|
+
current = parent;
|
|
1067
|
+
}
|
|
1068
|
+
if (!seen.has(boundary)) {
|
|
1069
|
+
directories.push(boundary);
|
|
1070
|
+
}
|
|
1071
|
+
return directories;
|
|
1072
|
+
}
|
|
1073
|
+
function buildSearchRoots2(evalPath, repoRoot) {
|
|
1074
|
+
const uniqueRoots = [];
|
|
1075
|
+
const addRoot = (root) => {
|
|
1076
|
+
const normalized = import_node_path7.default.resolve(root);
|
|
1077
|
+
if (!uniqueRoots.includes(normalized)) {
|
|
1078
|
+
uniqueRoots.push(normalized);
|
|
875
1079
|
}
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
1080
|
+
};
|
|
1081
|
+
let currentDir = import_node_path7.default.dirname(evalPath);
|
|
1082
|
+
let reachedBoundary = false;
|
|
1083
|
+
while (!reachedBoundary) {
|
|
1084
|
+
addRoot(currentDir);
|
|
1085
|
+
const parentDir = import_node_path7.default.dirname(currentDir);
|
|
1086
|
+
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
1087
|
+
reachedBoundary = true;
|
|
1088
|
+
} else {
|
|
1089
|
+
currentDir = parentDir;
|
|
880
1090
|
}
|
|
881
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
882
1091
|
}
|
|
883
|
-
|
|
1092
|
+
addRoot(repoRoot);
|
|
1093
|
+
addRoot(process.cwd());
|
|
1094
|
+
return uniqueRoots;
|
|
884
1095
|
}
|
|
885
|
-
|
|
886
|
-
const
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
1096
|
+
function trimLeadingSeparators2(value) {
|
|
1097
|
+
const trimmed = value.replace(/^[/\\]+/, "");
|
|
1098
|
+
return trimmed.length > 0 ? trimmed : value;
|
|
1099
|
+
}
|
|
1100
|
+
async function resolveFileReference2(rawValue, searchRoots) {
|
|
1101
|
+
const displayPath = trimLeadingSeparators2(rawValue);
|
|
1102
|
+
const potentialPaths = [];
|
|
1103
|
+
if (import_node_path7.default.isAbsolute(rawValue)) {
|
|
1104
|
+
potentialPaths.push(import_node_path7.default.normalize(rawValue));
|
|
890
1105
|
}
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
return void 0;
|
|
1106
|
+
for (const base of searchRoots) {
|
|
1107
|
+
potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
|
|
894
1108
|
}
|
|
895
|
-
const
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
}
|
|
901
|
-
const name = asString(rawEvaluator.name);
|
|
902
|
-
const typeValue = rawEvaluator.type;
|
|
903
|
-
if (!name || !isEvaluatorKind(typeValue)) {
|
|
904
|
-
logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
905
|
-
continue;
|
|
906
|
-
}
|
|
907
|
-
if (typeValue === "code") {
|
|
908
|
-
const script = asString(rawEvaluator.script);
|
|
909
|
-
if (!script) {
|
|
910
|
-
logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
|
|
911
|
-
continue;
|
|
912
|
-
}
|
|
913
|
-
const cwd = asString(rawEvaluator.cwd);
|
|
914
|
-
let resolvedCwd;
|
|
915
|
-
if (cwd) {
|
|
916
|
-
const resolved = await resolveFileReference(cwd, searchRoots);
|
|
917
|
-
if (resolved.resolvedPath) {
|
|
918
|
-
resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
|
|
919
|
-
} else {
|
|
920
|
-
logWarning(
|
|
921
|
-
`Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
922
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
923
|
-
);
|
|
924
|
-
}
|
|
925
|
-
} else {
|
|
926
|
-
resolvedCwd = searchRoots[0];
|
|
927
|
-
}
|
|
928
|
-
evaluators.push({
|
|
929
|
-
name,
|
|
930
|
-
type: "code",
|
|
931
|
-
script,
|
|
932
|
-
cwd,
|
|
933
|
-
resolvedCwd
|
|
934
|
-
});
|
|
1109
|
+
const attempted = [];
|
|
1110
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1111
|
+
for (const candidate of potentialPaths) {
|
|
1112
|
+
const absoluteCandidate = import_node_path7.default.resolve(candidate);
|
|
1113
|
+
if (seen.has(absoluteCandidate)) {
|
|
935
1114
|
continue;
|
|
936
1115
|
}
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
if (
|
|
940
|
-
|
|
941
|
-
if (resolved.resolvedPath) {
|
|
942
|
-
promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
|
|
943
|
-
} else {
|
|
944
|
-
logWarning(
|
|
945
|
-
`Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
|
|
946
|
-
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
947
|
-
);
|
|
948
|
-
}
|
|
1116
|
+
seen.add(absoluteCandidate);
|
|
1117
|
+
attempted.push(absoluteCandidate);
|
|
1118
|
+
if (await fileExists2(absoluteCandidate)) {
|
|
1119
|
+
return { displayPath, resolvedPath: absoluteCandidate, attempted };
|
|
949
1120
|
}
|
|
950
|
-
const model = asString(rawEvaluator.model);
|
|
951
|
-
evaluators.push({
|
|
952
|
-
name,
|
|
953
|
-
type: "llm_judge",
|
|
954
|
-
prompt,
|
|
955
|
-
promptPath
|
|
956
|
-
});
|
|
957
|
-
}
|
|
958
|
-
return evaluators.length > 0 ? evaluators : void 0;
|
|
959
|
-
}
|
|
960
|
-
function coerceEvaluator(candidate, contextId) {
|
|
961
|
-
if (typeof candidate !== "string") {
|
|
962
|
-
return void 0;
|
|
963
|
-
}
|
|
964
|
-
if (isEvaluatorKind(candidate)) {
|
|
965
|
-
return candidate;
|
|
966
|
-
}
|
|
967
|
-
logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
|
|
968
|
-
return void 0;
|
|
969
|
-
}
|
|
970
|
-
function logWarning(message, details) {
|
|
971
|
-
if (details && details.length > 0) {
|
|
972
|
-
const detailBlock = details.join("\n");
|
|
973
|
-
console.warn(`${ANSI_YELLOW}Warning: ${message}
|
|
974
|
-
${detailBlock}${ANSI_RESET}`);
|
|
975
|
-
} else {
|
|
976
|
-
console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
|
|
977
1121
|
}
|
|
1122
|
+
return { displayPath, attempted };
|
|
978
1123
|
}
|
|
979
1124
|
|
|
980
1125
|
// src/evaluation/providers/ax.ts
|
|
@@ -1005,9 +1150,8 @@ function buildChatPrompt(request) {
|
|
|
1005
1150
|
}
|
|
1006
1151
|
function resolveSystemContent(request) {
|
|
1007
1152
|
const systemSegments = [];
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
systemSegments.push(metadataSystemPrompt.trim());
|
|
1153
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
1154
|
+
systemSegments.push(request.systemPrompt.trim());
|
|
1011
1155
|
} else {
|
|
1012
1156
|
systemSegments.push(DEFAULT_SYSTEM_PROMPT);
|
|
1013
1157
|
}
|
|
@@ -1258,9 +1402,9 @@ var GeminiProvider = class {
|
|
|
1258
1402
|
|
|
1259
1403
|
// src/evaluation/providers/cli.ts
|
|
1260
1404
|
var import_node_child_process = require("child_process");
|
|
1261
|
-
var
|
|
1405
|
+
var import_promises7 = __toESM(require("fs/promises"), 1);
|
|
1262
1406
|
var import_node_os = __toESM(require("os"), 1);
|
|
1263
|
-
var
|
|
1407
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1264
1408
|
var import_node_util = require("util");
|
|
1265
1409
|
var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
|
|
1266
1410
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
@@ -1357,7 +1501,7 @@ var CliProvider = class {
|
|
|
1357
1501
|
const errorMsg = error instanceof Error ? error.message : String(error);
|
|
1358
1502
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
1359
1503
|
} finally {
|
|
1360
|
-
await
|
|
1504
|
+
await import_promises7.default.unlink(filePath).catch(() => {
|
|
1361
1505
|
});
|
|
1362
1506
|
}
|
|
1363
1507
|
}
|
|
@@ -1439,7 +1583,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
1439
1583
|
}
|
|
1440
1584
|
const unique = /* @__PURE__ */ new Map();
|
|
1441
1585
|
for (const inputFile of inputFiles) {
|
|
1442
|
-
const absolutePath =
|
|
1586
|
+
const absolutePath = import_node_path8.default.resolve(inputFile);
|
|
1443
1587
|
if (!unique.has(absolutePath)) {
|
|
1444
1588
|
unique.set(absolutePath, absolutePath);
|
|
1445
1589
|
}
|
|
@@ -1453,7 +1597,7 @@ function formatFileList(files, template) {
|
|
|
1453
1597
|
const formatter = template ?? "{path}";
|
|
1454
1598
|
return files.map((filePath) => {
|
|
1455
1599
|
const escapedPath = shellEscape(filePath);
|
|
1456
|
-
const escapedName = shellEscape(
|
|
1600
|
+
const escapedName = shellEscape(import_node_path8.default.basename(filePath));
|
|
1457
1601
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
1458
1602
|
}).join(" ");
|
|
1459
1603
|
}
|
|
@@ -1477,7 +1621,7 @@ function generateOutputFilePath(evalCaseId) {
|
|
|
1477
1621
|
const safeEvalId = evalCaseId || "unknown";
|
|
1478
1622
|
const timestamp = Date.now();
|
|
1479
1623
|
const random = Math.random().toString(36).substring(2, 9);
|
|
1480
|
-
return
|
|
1624
|
+
return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
1481
1625
|
}
|
|
1482
1626
|
function formatTimeoutSuffix(timeoutMs) {
|
|
1483
1627
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -1491,9 +1635,9 @@ function formatTimeoutSuffix(timeoutMs) {
|
|
|
1491
1635
|
var import_node_child_process2 = require("child_process");
|
|
1492
1636
|
var import_node_crypto = require("crypto");
|
|
1493
1637
|
var import_node_fs3 = require("fs");
|
|
1494
|
-
var
|
|
1638
|
+
var import_promises8 = require("fs/promises");
|
|
1495
1639
|
var import_node_os2 = require("os");
|
|
1496
|
-
var
|
|
1640
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
1497
1641
|
var import_node_util2 = require("util");
|
|
1498
1642
|
|
|
1499
1643
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -1550,7 +1694,7 @@ function subscribeToCodexLogEntries(listener) {
|
|
|
1550
1694
|
}
|
|
1551
1695
|
|
|
1552
1696
|
// src/evaluation/providers/preread.ts
|
|
1553
|
-
var
|
|
1697
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
1554
1698
|
function buildPromptDocument(request, inputFiles, options) {
|
|
1555
1699
|
const parts = [];
|
|
1556
1700
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -1575,7 +1719,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
1575
1719
|
}
|
|
1576
1720
|
const deduped = /* @__PURE__ */ new Map();
|
|
1577
1721
|
for (const inputFile of inputFiles) {
|
|
1578
|
-
const absolutePath =
|
|
1722
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1579
1723
|
if (!deduped.has(absolutePath)) {
|
|
1580
1724
|
deduped.set(absolutePath, absolutePath);
|
|
1581
1725
|
}
|
|
@@ -1588,14 +1732,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
1588
1732
|
}
|
|
1589
1733
|
const unique = /* @__PURE__ */ new Map();
|
|
1590
1734
|
for (const inputFile of inputFiles) {
|
|
1591
|
-
const absolutePath =
|
|
1735
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1592
1736
|
if (overrides?.has(absolutePath)) {
|
|
1593
1737
|
if (!unique.has(absolutePath)) {
|
|
1594
1738
|
unique.set(absolutePath, absolutePath);
|
|
1595
1739
|
}
|
|
1596
1740
|
continue;
|
|
1597
1741
|
}
|
|
1598
|
-
const normalized = absolutePath.split(
|
|
1742
|
+
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
1599
1743
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
1600
1744
|
if (!unique.has(absolutePath)) {
|
|
1601
1745
|
unique.set(absolutePath, absolutePath);
|
|
@@ -1610,7 +1754,7 @@ function collectInputFiles(inputFiles) {
|
|
|
1610
1754
|
}
|
|
1611
1755
|
const unique = /* @__PURE__ */ new Map();
|
|
1612
1756
|
for (const inputFile of inputFiles) {
|
|
1613
|
-
const absolutePath =
|
|
1757
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
1614
1758
|
if (!unique.has(absolutePath)) {
|
|
1615
1759
|
unique.set(absolutePath, absolutePath);
|
|
1616
1760
|
}
|
|
@@ -1622,7 +1766,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
1622
1766
|
return "";
|
|
1623
1767
|
}
|
|
1624
1768
|
const buildList = (files) => files.map((absolutePath) => {
|
|
1625
|
-
const fileName =
|
|
1769
|
+
const fileName = import_node_path9.default.basename(absolutePath);
|
|
1626
1770
|
const fileUri = pathToFileUri(absolutePath);
|
|
1627
1771
|
return `* [${fileName}](${fileUri})`;
|
|
1628
1772
|
});
|
|
@@ -1642,7 +1786,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
1642
1786
|
return sections.join("\n");
|
|
1643
1787
|
}
|
|
1644
1788
|
function pathToFileUri(filePath) {
|
|
1645
|
-
const absolutePath =
|
|
1789
|
+
const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
|
|
1646
1790
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
1647
1791
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
1648
1792
|
return `file:///${normalizedPath}`;
|
|
@@ -1680,8 +1824,8 @@ var CodexProvider = class {
|
|
|
1680
1824
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
1681
1825
|
try {
|
|
1682
1826
|
const promptContent = buildPromptDocument(request, inputFiles);
|
|
1683
|
-
const promptFile =
|
|
1684
|
-
await (0,
|
|
1827
|
+
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
1828
|
+
await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
|
|
1685
1829
|
const args = this.buildCodexArgs();
|
|
1686
1830
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
1687
1831
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -1730,7 +1874,7 @@ var CodexProvider = class {
|
|
|
1730
1874
|
if (!this.config.cwd) {
|
|
1731
1875
|
return workspaceRoot;
|
|
1732
1876
|
}
|
|
1733
|
-
return
|
|
1877
|
+
return import_node_path10.default.resolve(this.config.cwd);
|
|
1734
1878
|
}
|
|
1735
1879
|
buildCodexArgs() {
|
|
1736
1880
|
const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
|
|
@@ -1764,11 +1908,11 @@ var CodexProvider = class {
|
|
|
1764
1908
|
}
|
|
1765
1909
|
}
|
|
1766
1910
|
async createWorkspace() {
|
|
1767
|
-
return await (0,
|
|
1911
|
+
return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
|
|
1768
1912
|
}
|
|
1769
1913
|
async cleanupWorkspace(workspaceRoot) {
|
|
1770
1914
|
try {
|
|
1771
|
-
await (0,
|
|
1915
|
+
await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
|
|
1772
1916
|
} catch {
|
|
1773
1917
|
}
|
|
1774
1918
|
}
|
|
@@ -1778,9 +1922,9 @@ var CodexProvider = class {
|
|
|
1778
1922
|
return void 0;
|
|
1779
1923
|
}
|
|
1780
1924
|
if (this.config.logDir) {
|
|
1781
|
-
return
|
|
1925
|
+
return import_node_path10.default.resolve(this.config.logDir);
|
|
1782
1926
|
}
|
|
1783
|
-
return
|
|
1927
|
+
return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
1784
1928
|
}
|
|
1785
1929
|
async createStreamLogger(request) {
|
|
1786
1930
|
const logDir = this.resolveLogDirectory();
|
|
@@ -1788,13 +1932,13 @@ var CodexProvider = class {
|
|
|
1788
1932
|
return void 0;
|
|
1789
1933
|
}
|
|
1790
1934
|
try {
|
|
1791
|
-
await (0,
|
|
1935
|
+
await (0, import_promises8.mkdir)(logDir, { recursive: true });
|
|
1792
1936
|
} catch (error) {
|
|
1793
1937
|
const message = error instanceof Error ? error.message : String(error);
|
|
1794
1938
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
1795
1939
|
return void 0;
|
|
1796
1940
|
}
|
|
1797
|
-
const filePath =
|
|
1941
|
+
const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
1798
1942
|
try {
|
|
1799
1943
|
const logger = await CodexStreamLogger.create({
|
|
1800
1944
|
filePath,
|
|
@@ -2009,9 +2153,9 @@ function tryParseJsonValue(rawLine) {
|
|
|
2009
2153
|
async function locateExecutable(candidate) {
|
|
2010
2154
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
2011
2155
|
if (includesPathSeparator) {
|
|
2012
|
-
const resolved =
|
|
2156
|
+
const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
|
|
2013
2157
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
2014
|
-
await (0,
|
|
2158
|
+
await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2015
2159
|
return executablePath;
|
|
2016
2160
|
}
|
|
2017
2161
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -2021,7 +2165,7 @@ async function locateExecutable(candidate) {
|
|
|
2021
2165
|
const preferred = selectExecutableCandidate(lines);
|
|
2022
2166
|
if (preferred) {
|
|
2023
2167
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
2024
|
-
await (0,
|
|
2168
|
+
await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
|
|
2025
2169
|
return executablePath;
|
|
2026
2170
|
}
|
|
2027
2171
|
} catch {
|
|
@@ -2055,7 +2199,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
2055
2199
|
for (const ext of extensions) {
|
|
2056
2200
|
const withExtension = `${candidate}${ext}`;
|
|
2057
2201
|
try {
|
|
2058
|
-
await (0,
|
|
2202
|
+
await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
|
|
2059
2203
|
return withExtension;
|
|
2060
2204
|
} catch {
|
|
2061
2205
|
}
|
|
@@ -2867,7 +3011,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
2867
3011
|
}
|
|
2868
3012
|
|
|
2869
3013
|
// src/evaluation/providers/vscode.ts
|
|
2870
|
-
var
|
|
3014
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2871
3015
|
var import_subagent = require("subagent");
|
|
2872
3016
|
var VSCodeProvider = class {
|
|
2873
3017
|
id;
|
|
@@ -2980,6 +3124,9 @@ var VSCodeProvider = class {
|
|
|
2980
3124
|
};
|
|
2981
3125
|
function buildPromptDocument2(request, attachments, guidelinePatterns) {
|
|
2982
3126
|
const parts = [];
|
|
3127
|
+
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
3128
|
+
parts.push(request.systemPrompt.trim());
|
|
3129
|
+
}
|
|
2983
3130
|
const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
|
|
2984
3131
|
const attachmentFiles = collectAttachmentFiles(attachments);
|
|
2985
3132
|
const nonGuidelineAttachments = attachmentFiles.filter(
|
|
@@ -2997,7 +3144,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
2997
3144
|
return "";
|
|
2998
3145
|
}
|
|
2999
3146
|
const buildList = (files) => files.map((absolutePath) => {
|
|
3000
|
-
const fileName =
|
|
3147
|
+
const fileName = import_node_path11.default.basename(absolutePath);
|
|
3001
3148
|
const fileUri = pathToFileUri2(absolutePath);
|
|
3002
3149
|
return `* [${fileName}](${fileUri})`;
|
|
3003
3150
|
});
|
|
@@ -3022,8 +3169,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
3022
3169
|
}
|
|
3023
3170
|
const unique = /* @__PURE__ */ new Map();
|
|
3024
3171
|
for (const attachment of attachments) {
|
|
3025
|
-
const absolutePath =
|
|
3026
|
-
const normalized = absolutePath.split(
|
|
3172
|
+
const absolutePath = import_node_path11.default.resolve(attachment);
|
|
3173
|
+
const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
|
|
3027
3174
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
3028
3175
|
if (!unique.has(absolutePath)) {
|
|
3029
3176
|
unique.set(absolutePath, absolutePath);
|
|
@@ -3038,7 +3185,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3038
3185
|
}
|
|
3039
3186
|
const unique = /* @__PURE__ */ new Map();
|
|
3040
3187
|
for (const attachment of attachments) {
|
|
3041
|
-
const absolutePath =
|
|
3188
|
+
const absolutePath = import_node_path11.default.resolve(attachment);
|
|
3042
3189
|
if (!unique.has(absolutePath)) {
|
|
3043
3190
|
unique.set(absolutePath, absolutePath);
|
|
3044
3191
|
}
|
|
@@ -3046,7 +3193,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
3046
3193
|
return Array.from(unique.values());
|
|
3047
3194
|
}
|
|
3048
3195
|
function pathToFileUri2(filePath) {
|
|
3049
|
-
const absolutePath =
|
|
3196
|
+
const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
|
|
3050
3197
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
3051
3198
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
3052
3199
|
return `file:///${normalizedPath}`;
|
|
@@ -3059,7 +3206,7 @@ function normalizeAttachments(attachments) {
|
|
|
3059
3206
|
}
|
|
3060
3207
|
const deduped = /* @__PURE__ */ new Set();
|
|
3061
3208
|
for (const attachment of attachments) {
|
|
3062
|
-
deduped.add(
|
|
3209
|
+
deduped.add(import_node_path11.default.resolve(attachment));
|
|
3063
3210
|
}
|
|
3064
3211
|
return Array.from(deduped);
|
|
3065
3212
|
}
|
|
@@ -3068,7 +3215,7 @@ function mergeAttachments(all) {
|
|
|
3068
3215
|
for (const list of all) {
|
|
3069
3216
|
if (!list) continue;
|
|
3070
3217
|
for (const inputFile of list) {
|
|
3071
|
-
deduped.add(
|
|
3218
|
+
deduped.add(import_node_path11.default.resolve(inputFile));
|
|
3072
3219
|
}
|
|
3073
3220
|
}
|
|
3074
3221
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -3114,9 +3261,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
3114
3261
|
|
|
3115
3262
|
// src/evaluation/providers/targets-file.ts
|
|
3116
3263
|
var import_node_fs4 = require("fs");
|
|
3117
|
-
var
|
|
3118
|
-
var
|
|
3119
|
-
var
|
|
3264
|
+
var import_promises9 = require("fs/promises");
|
|
3265
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3266
|
+
var import_yaml3 = require("yaml");
|
|
3120
3267
|
|
|
3121
3268
|
// src/evaluation/providers/types.ts
|
|
3122
3269
|
var AGENT_PROVIDER_KINDS = [
|
|
@@ -3177,19 +3324,19 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
3177
3324
|
}
|
|
3178
3325
|
async function fileExists3(filePath) {
|
|
3179
3326
|
try {
|
|
3180
|
-
await (0,
|
|
3327
|
+
await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
|
|
3181
3328
|
return true;
|
|
3182
3329
|
} catch {
|
|
3183
3330
|
return false;
|
|
3184
3331
|
}
|
|
3185
3332
|
}
|
|
3186
3333
|
async function readTargetDefinitions(filePath) {
|
|
3187
|
-
const absolutePath =
|
|
3334
|
+
const absolutePath = import_node_path12.default.resolve(filePath);
|
|
3188
3335
|
if (!await fileExists3(absolutePath)) {
|
|
3189
3336
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
3190
3337
|
}
|
|
3191
|
-
const raw = await (0,
|
|
3192
|
-
const parsed = (0,
|
|
3338
|
+
const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
|
|
3339
|
+
const parsed = (0, import_yaml3.parse)(raw);
|
|
3193
3340
|
if (!isRecord(parsed)) {
|
|
3194
3341
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
|
|
3195
3342
|
}
|
|
@@ -3232,18 +3379,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
3232
3379
|
}
|
|
3233
3380
|
|
|
3234
3381
|
// src/evaluation/evaluators.ts
|
|
3235
|
-
var
|
|
3382
|
+
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
3383
|
+
|
|
3384
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
3385
|
+
|
|
3386
|
+
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
3387
|
+
|
|
3388
|
+
[[ ## expected_outcome ## ]]
|
|
3389
|
+
{{expected_outcome}}
|
|
3390
|
+
|
|
3391
|
+
[[ ## question ## ]]
|
|
3392
|
+
{{question}}
|
|
3393
|
+
|
|
3394
|
+
[[ ## reference_answer ## ]]
|
|
3395
|
+
{{reference_answer}}
|
|
3396
|
+
|
|
3397
|
+
[[ ## candidate_answer ## ]]
|
|
3398
|
+
{{candidate_answer}}`;
|
|
3236
3399
|
var LlmJudgeEvaluator = class {
|
|
3237
3400
|
kind = "llm_judge";
|
|
3238
3401
|
resolveJudgeProvider;
|
|
3239
3402
|
maxOutputTokens;
|
|
3240
3403
|
temperature;
|
|
3241
|
-
|
|
3404
|
+
evaluatorTemplate;
|
|
3242
3405
|
constructor(options) {
|
|
3243
3406
|
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
3244
3407
|
this.maxOutputTokens = options.maxOutputTokens;
|
|
3245
3408
|
this.temperature = options.temperature;
|
|
3246
|
-
this.
|
|
3409
|
+
this.evaluatorTemplate = options.evaluatorTemplate;
|
|
3247
3410
|
}
|
|
3248
3411
|
async evaluate(context) {
|
|
3249
3412
|
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
@@ -3253,26 +3416,21 @@ var LlmJudgeEvaluator = class {
|
|
|
3253
3416
|
return this.evaluateWithPrompt(context, judgeProvider);
|
|
3254
3417
|
}
|
|
3255
3418
|
async evaluateWithPrompt(context, judgeProvider) {
|
|
3256
|
-
const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
|
|
3257
3419
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
prompt = substituteVariables(systemPrompt, variables);
|
|
3270
|
-
systemPrompt = buildSystemPrompt(hasReferenceAnswer);
|
|
3271
|
-
}
|
|
3272
|
-
const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
|
|
3420
|
+
const variables = {
|
|
3421
|
+
input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
3422
|
+
output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
|
|
3423
|
+
candidate_answer: context.candidate.trim(),
|
|
3424
|
+
reference_answer: (context.evalCase.reference_answer ?? "").trim(),
|
|
3425
|
+
expected_outcome: context.evalCase.expected_outcome.trim(),
|
|
3426
|
+
question: formattedQuestion.trim()
|
|
3427
|
+
};
|
|
3428
|
+
const systemPrompt = buildOutputSchema();
|
|
3429
|
+
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
3430
|
+
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
3273
3431
|
const response = await judgeProvider.invoke({
|
|
3274
|
-
question:
|
|
3275
|
-
|
|
3432
|
+
question: userPrompt,
|
|
3433
|
+
systemPrompt,
|
|
3276
3434
|
evalCaseId: context.evalCase.id,
|
|
3277
3435
|
attempt: context.attempt,
|
|
3278
3436
|
maxOutputTokens: this.maxOutputTokens,
|
|
@@ -3285,11 +3443,9 @@ var LlmJudgeEvaluator = class {
|
|
|
3285
3443
|
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
3286
3444
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
3287
3445
|
const evaluatorRawRequest = {
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
|
|
3291
|
-
target: context.target.name,
|
|
3292
|
-
...systemPrompt !== void 0 && { systemPrompt }
|
|
3446
|
+
userPrompt,
|
|
3447
|
+
systemPrompt,
|
|
3448
|
+
target: judgeProvider.targetName
|
|
3293
3449
|
};
|
|
3294
3450
|
return {
|
|
3295
3451
|
score,
|
|
@@ -3301,20 +3457,8 @@ var LlmJudgeEvaluator = class {
|
|
|
3301
3457
|
};
|
|
3302
3458
|
}
|
|
3303
3459
|
};
|
|
3304
|
-
function
|
|
3305
|
-
|
|
3306
|
-
"You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
|
|
3307
|
-
""
|
|
3308
|
-
];
|
|
3309
|
-
if (hasReferenceAnswer) {
|
|
3310
|
-
basePrompt.push(
|
|
3311
|
-
"Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
|
|
3312
|
-
""
|
|
3313
|
-
);
|
|
3314
|
-
}
|
|
3315
|
-
basePrompt.push(
|
|
3316
|
-
"Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
|
|
3317
|
-
"",
|
|
3460
|
+
function buildOutputSchema() {
|
|
3461
|
+
return [
|
|
3318
3462
|
"You must respond with a single JSON object matching this schema:",
|
|
3319
3463
|
"",
|
|
3320
3464
|
"{",
|
|
@@ -3323,30 +3467,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
|
|
|
3323
3467
|
' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
|
|
3324
3468
|
' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
|
|
3325
3469
|
"}"
|
|
3326
|
-
);
|
|
3327
|
-
return basePrompt.join("\n");
|
|
3328
|
-
}
|
|
3329
|
-
function buildQualityPrompt(evalCase, candidate, question) {
|
|
3330
|
-
const parts = [
|
|
3331
|
-
"[[ ## expected_outcome ## ]]",
|
|
3332
|
-
evalCase.expected_outcome.trim(),
|
|
3333
|
-
"",
|
|
3334
|
-
"[[ ## question ## ]]",
|
|
3335
|
-
question.trim(),
|
|
3336
|
-
""
|
|
3337
|
-
];
|
|
3338
|
-
if (hasNonEmptyReferenceAnswer(evalCase)) {
|
|
3339
|
-
parts.push(
|
|
3340
|
-
"[[ ## reference_answer ## ]]",
|
|
3341
|
-
evalCase.reference_answer.trim(),
|
|
3342
|
-
""
|
|
3343
|
-
);
|
|
3344
|
-
}
|
|
3345
|
-
parts.push(
|
|
3346
|
-
"[[ ## candidate_answer ## ]]",
|
|
3347
|
-
candidate.trim()
|
|
3348
|
-
);
|
|
3349
|
-
return parts.join("\n");
|
|
3470
|
+
].join("\n");
|
|
3350
3471
|
}
|
|
3351
3472
|
function clampScore(value) {
|
|
3352
3473
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -3428,9 +3549,6 @@ function extractJsonBlob(text) {
|
|
|
3428
3549
|
function isNonEmptyString(value) {
|
|
3429
3550
|
return typeof value === "string" && value.trim().length > 0;
|
|
3430
3551
|
}
|
|
3431
|
-
function hasNonEmptyReferenceAnswer(evalCase) {
|
|
3432
|
-
return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
|
|
3433
|
-
}
|
|
3434
3552
|
var CodeEvaluator = class {
|
|
3435
3553
|
kind = "code";
|
|
3436
3554
|
script;
|
|
@@ -3536,19 +3654,16 @@ function parseJsonSafe(payload) {
|
|
|
3536
3654
|
return void 0;
|
|
3537
3655
|
}
|
|
3538
3656
|
}
|
|
3539
|
-
function hasTemplateVariables(text) {
|
|
3540
|
-
return /\$\{[a-zA-Z0-9_]+\}/.test(text);
|
|
3541
|
-
}
|
|
3542
3657
|
function substituteVariables(template, variables) {
|
|
3543
|
-
return template.replace(
|
|
3658
|
+
return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
|
|
3544
3659
|
return variables[varName] ?? match;
|
|
3545
3660
|
});
|
|
3546
3661
|
}
|
|
3547
3662
|
|
|
3548
3663
|
// src/evaluation/orchestrator.ts
|
|
3549
|
-
var
|
|
3550
|
-
var
|
|
3551
|
-
var
|
|
3664
|
+
var import_node_crypto2 = require("crypto");
|
|
3665
|
+
var import_promises10 = require("fs/promises");
|
|
3666
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
3552
3667
|
|
|
3553
3668
|
// ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
|
|
3554
3669
|
var Node = class {
|
|
@@ -4111,6 +4226,7 @@ async function evaluateCandidate(options) {
|
|
|
4111
4226
|
}
|
|
4112
4227
|
}
|
|
4113
4228
|
return {
|
|
4229
|
+
timestamp: completedAt.toISOString(),
|
|
4114
4230
|
eval_id: evalCase.id,
|
|
4115
4231
|
dataset: evalCase.dataset,
|
|
4116
4232
|
conversation_id: evalCase.conversation_id,
|
|
@@ -4118,14 +4234,12 @@ async function evaluateCandidate(options) {
|
|
|
4118
4234
|
hits: score.hits,
|
|
4119
4235
|
misses: score.misses,
|
|
4120
4236
|
candidate_answer: candidate,
|
|
4121
|
-
expected_aspect_count: score.expectedAspectCount,
|
|
4122
4237
|
target: target.name,
|
|
4123
|
-
timestamp: completedAt.toISOString(),
|
|
4124
4238
|
reasoning: score.reasoning,
|
|
4125
4239
|
raw_aspects: score.rawAspects,
|
|
4126
4240
|
agent_provider_request: agentProviderRequest,
|
|
4127
4241
|
lm_provider_request: lmProviderRequest,
|
|
4128
|
-
|
|
4242
|
+
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
4129
4243
|
evaluator_results: evaluatorResults
|
|
4130
4244
|
};
|
|
4131
4245
|
}
|
|
@@ -4202,7 +4316,7 @@ async function runEvaluatorList(options) {
|
|
|
4202
4316
|
hits: score2.hits,
|
|
4203
4317
|
misses: score2.misses,
|
|
4204
4318
|
reasoning: score2.reasoning,
|
|
4205
|
-
|
|
4319
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4206
4320
|
});
|
|
4207
4321
|
continue;
|
|
4208
4322
|
}
|
|
@@ -4229,7 +4343,7 @@ async function runEvaluatorList(options) {
|
|
|
4229
4343
|
hits: score2.hits,
|
|
4230
4344
|
misses: score2.misses,
|
|
4231
4345
|
reasoning: score2.reasoning,
|
|
4232
|
-
|
|
4346
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4233
4347
|
});
|
|
4234
4348
|
continue;
|
|
4235
4349
|
}
|
|
@@ -4282,7 +4396,7 @@ async function runLlmJudgeEvaluator(options) {
|
|
|
4282
4396
|
promptInputs,
|
|
4283
4397
|
now,
|
|
4284
4398
|
judgeProvider,
|
|
4285
|
-
|
|
4399
|
+
evaluatorTemplateOverride: customPrompt,
|
|
4286
4400
|
evaluator: config
|
|
4287
4401
|
});
|
|
4288
4402
|
}
|
|
@@ -4323,22 +4437,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
4323
4437
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
4324
4438
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
4325
4439
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
4326
|
-
const filePath =
|
|
4327
|
-
await (0,
|
|
4440
|
+
const filePath = import_node_path13.default.resolve(directory, filename);
|
|
4441
|
+
await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
|
|
4328
4442
|
const payload = {
|
|
4329
4443
|
eval_id: evalCase.id,
|
|
4330
4444
|
question: promptInputs.question,
|
|
4331
4445
|
guidelines: promptInputs.guidelines,
|
|
4332
4446
|
guideline_paths: evalCase.guideline_paths
|
|
4333
4447
|
};
|
|
4334
|
-
await (0,
|
|
4448
|
+
await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
4335
4449
|
}
|
|
4336
4450
|
function sanitizeFilename(value) {
|
|
4337
4451
|
if (!value) {
|
|
4338
4452
|
return "prompt";
|
|
4339
4453
|
}
|
|
4340
4454
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
4341
|
-
return sanitized.length > 0 ? sanitized : (0,
|
|
4455
|
+
return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
|
|
4342
4456
|
}
|
|
4343
4457
|
async function invokeProvider(provider, options) {
|
|
4344
4458
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -4394,6 +4508,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4394
4508
|
}
|
|
4395
4509
|
}
|
|
4396
4510
|
return {
|
|
4511
|
+
timestamp: timestamp.toISOString(),
|
|
4397
4512
|
eval_id: evalCase.id,
|
|
4398
4513
|
dataset: evalCase.dataset,
|
|
4399
4514
|
conversation_id: evalCase.conversation_id,
|
|
@@ -4401,9 +4516,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4401
4516
|
hits: [],
|
|
4402
4517
|
misses: [`Error: ${message}`],
|
|
4403
4518
|
candidate_answer: `Error occurred: ${message}`,
|
|
4404
|
-
expected_aspect_count: 0,
|
|
4405
4519
|
target: targetName,
|
|
4406
|
-
timestamp: timestamp.toISOString(),
|
|
4407
4520
|
raw_aspects: [],
|
|
4408
4521
|
agent_provider_request: agentProviderRequest,
|
|
4409
4522
|
lm_provider_request: lmProviderRequest,
|
|
@@ -4411,7 +4524,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
4411
4524
|
};
|
|
4412
4525
|
}
|
|
4413
4526
|
function createCacheKey(provider, target, evalCase, promptInputs) {
|
|
4414
|
-
const hash = (0,
|
|
4527
|
+
const hash = (0, import_node_crypto2.createHash)("sha256");
|
|
4415
4528
|
hash.update(provider.id);
|
|
4416
4529
|
hash.update(target.name);
|
|
4417
4530
|
hash.update(evalCase.id);
|