@agentv/core 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs +0 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +0 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +458 -211
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -2
- package/dist/index.d.ts +8 -2
- package/dist/index.js +405 -159
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -53,6 +53,7 @@ __export(index_exports, {
|
|
|
53
53
|
createAgentKernel: () => createAgentKernel,
|
|
54
54
|
createProvider: () => createProvider,
|
|
55
55
|
deepEqual: () => deepEqual,
|
|
56
|
+
detectFormat: () => detectFormat,
|
|
56
57
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
57
58
|
executeScript: () => executeScript,
|
|
58
59
|
explorationRatio: () => explorationRatio,
|
|
@@ -226,9 +227,9 @@ function mergeExecutionMetrics(summary, metrics) {
|
|
|
226
227
|
}
|
|
227
228
|
|
|
228
229
|
// src/evaluation/yaml-parser.ts
|
|
229
|
-
var
|
|
230
|
-
var
|
|
231
|
-
var
|
|
230
|
+
var import_promises7 = require("fs/promises");
|
|
231
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
232
|
+
var import_yaml3 = require("yaml");
|
|
232
233
|
|
|
233
234
|
// src/evaluation/loaders/config-loader.ts
|
|
234
235
|
var import_promises2 = require("fs/promises");
|
|
@@ -337,7 +338,6 @@ async function resolveFileReference(rawValue, searchRoots) {
|
|
|
337
338
|
}
|
|
338
339
|
|
|
339
340
|
// src/evaluation/loaders/config-loader.ts
|
|
340
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
341
341
|
var ANSI_YELLOW = "\x1B[33m";
|
|
342
342
|
var ANSI_RESET = "\x1B[0m";
|
|
343
343
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
@@ -355,13 +355,6 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
355
355
|
continue;
|
|
356
356
|
}
|
|
357
357
|
const config = parsed;
|
|
358
|
-
const schema = config.$schema;
|
|
359
|
-
if (schema !== SCHEMA_CONFIG_V2) {
|
|
360
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
361
|
-
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
362
|
-
logWarning(message);
|
|
363
|
-
continue;
|
|
364
|
-
}
|
|
365
358
|
const guidelinePatterns = config.guideline_patterns;
|
|
366
359
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
367
360
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -470,7 +463,8 @@ var ANSI_YELLOW3 = "\x1B[33m";
|
|
|
470
463
|
var ANSI_RESET3 = "\x1B[0m";
|
|
471
464
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
472
465
|
const execution = rawEvalCase.execution;
|
|
473
|
-
const
|
|
466
|
+
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
467
|
+
const candidateEvaluators = (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
474
468
|
if (candidateEvaluators === void 0) {
|
|
475
469
|
return void 0;
|
|
476
470
|
}
|
|
@@ -1013,6 +1007,11 @@ function isValidFieldAggregationType(value) {
|
|
|
1013
1007
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
1014
1008
|
}
|
|
1015
1009
|
|
|
1010
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1011
|
+
var import_promises5 = require("fs/promises");
|
|
1012
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
1013
|
+
var import_yaml2 = require("yaml");
|
|
1014
|
+
|
|
1016
1015
|
// src/evaluation/loaders/message-processor.ts
|
|
1017
1016
|
var import_promises4 = require("fs/promises");
|
|
1018
1017
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
@@ -1273,28 +1272,271 @@ async function processExpectedMessages(options) {
|
|
|
1273
1272
|
return segments;
|
|
1274
1273
|
}
|
|
1275
1274
|
|
|
1276
|
-
// src/evaluation/
|
|
1277
|
-
var import_promises5 = require("fs/promises");
|
|
1278
|
-
var import_node_path5 = __toESM(require("path"), 1);
|
|
1275
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1279
1276
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
1277
|
+
var ANSI_RED = "\x1B[31m";
|
|
1280
1278
|
var ANSI_RESET5 = "\x1B[0m";
|
|
1279
|
+
function detectFormat(filePath) {
|
|
1280
|
+
const ext = import_node_path5.default.extname(filePath).toLowerCase();
|
|
1281
|
+
if (ext === ".jsonl") return "jsonl";
|
|
1282
|
+
if (ext === ".yaml" || ext === ".yml") return "yaml";
|
|
1283
|
+
throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`);
|
|
1284
|
+
}
|
|
1285
|
+
async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
1286
|
+
const dir = import_node_path5.default.dirname(jsonlPath);
|
|
1287
|
+
const base = import_node_path5.default.basename(jsonlPath, ".jsonl");
|
|
1288
|
+
const sidecarPath = import_node_path5.default.join(dir, `${base}.yaml`);
|
|
1289
|
+
if (!await fileExists(sidecarPath)) {
|
|
1290
|
+
if (verbose) {
|
|
1291
|
+
logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
|
|
1292
|
+
}
|
|
1293
|
+
return {};
|
|
1294
|
+
}
|
|
1295
|
+
try {
|
|
1296
|
+
const content = await (0, import_promises5.readFile)(sidecarPath, "utf8");
|
|
1297
|
+
const parsed = (0, import_yaml2.parse)(content);
|
|
1298
|
+
if (!isJsonObject(parsed)) {
|
|
1299
|
+
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
1300
|
+
return {};
|
|
1301
|
+
}
|
|
1302
|
+
return {
|
|
1303
|
+
description: asString4(parsed.description),
|
|
1304
|
+
dataset: asString4(parsed.dataset),
|
|
1305
|
+
execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
|
|
1306
|
+
evaluator: parsed.evaluator
|
|
1307
|
+
};
|
|
1308
|
+
} catch (error) {
|
|
1309
|
+
logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${error.message}`);
|
|
1310
|
+
return {};
|
|
1311
|
+
}
|
|
1312
|
+
}
|
|
1313
|
+
function parseJsonlContent(content, filePath) {
|
|
1314
|
+
const lines = content.split("\n");
|
|
1315
|
+
const cases = [];
|
|
1316
|
+
for (let i = 0; i < lines.length; i++) {
|
|
1317
|
+
const line = lines[i].trim();
|
|
1318
|
+
if (line === "") continue;
|
|
1319
|
+
try {
|
|
1320
|
+
const parsed = JSON.parse(line);
|
|
1321
|
+
if (!isJsonObject(parsed)) {
|
|
1322
|
+
throw new Error("Expected JSON object");
|
|
1323
|
+
}
|
|
1324
|
+
cases.push(parsed);
|
|
1325
|
+
} catch (error) {
|
|
1326
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1327
|
+
throw new Error(`Line ${i + 1}: Invalid JSON - ${message}
|
|
1328
|
+
File: ${filePath}`);
|
|
1329
|
+
}
|
|
1330
|
+
}
|
|
1331
|
+
return cases;
|
|
1332
|
+
}
|
|
1333
|
+
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
1334
|
+
const verbose = options?.verbose ?? false;
|
|
1335
|
+
const evalIdFilter = options?.evalId;
|
|
1336
|
+
const absoluteTestPath = import_node_path5.default.resolve(evalFilePath);
|
|
1337
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1338
|
+
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
1339
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1340
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
1341
|
+
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
1342
|
+
const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
|
|
1343
|
+
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
1344
|
+
const fallbackDataset = import_node_path5.default.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
1345
|
+
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
1346
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
|
|
1347
|
+
const globalExecution = sidecar.execution;
|
|
1348
|
+
if (verbose) {
|
|
1349
|
+
console.log(`
|
|
1350
|
+
[JSONL Dataset: ${evalFilePath}]`);
|
|
1351
|
+
console.log(` Cases: ${rawCases.length}`);
|
|
1352
|
+
console.log(` Dataset name: ${datasetName}`);
|
|
1353
|
+
if (sidecar.description) {
|
|
1354
|
+
console.log(` Description: ${sidecar.description}`);
|
|
1355
|
+
}
|
|
1356
|
+
}
|
|
1357
|
+
const results = [];
|
|
1358
|
+
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
1359
|
+
const evalcase = rawCases[lineIndex];
|
|
1360
|
+
const lineNumber = lineIndex + 1;
|
|
1361
|
+
const id = asString4(evalcase.id);
|
|
1362
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
1363
|
+
continue;
|
|
1364
|
+
}
|
|
1365
|
+
const conversationId = asString4(evalcase.conversation_id);
|
|
1366
|
+
const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
|
|
1367
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
1368
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
1369
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
1370
|
+
logError(
|
|
1371
|
+
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
|
|
1372
|
+
);
|
|
1373
|
+
continue;
|
|
1374
|
+
}
|
|
1375
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
1376
|
+
const inputMessages = inputMessagesValue.filter(
|
|
1377
|
+
(msg) => isTestMessage(msg)
|
|
1378
|
+
);
|
|
1379
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1380
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1381
|
+
logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
|
|
1382
|
+
continue;
|
|
1383
|
+
}
|
|
1384
|
+
const guidelinePaths = [];
|
|
1385
|
+
const inputTextParts = [];
|
|
1386
|
+
const inputSegments = await processMessages({
|
|
1387
|
+
messages: inputMessages,
|
|
1388
|
+
searchRoots,
|
|
1389
|
+
repoRootPath,
|
|
1390
|
+
guidelinePatterns,
|
|
1391
|
+
guidelinePaths,
|
|
1392
|
+
textParts: inputTextParts,
|
|
1393
|
+
messageType: "input",
|
|
1394
|
+
verbose
|
|
1395
|
+
});
|
|
1396
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1397
|
+
messages: expectedMessages,
|
|
1398
|
+
searchRoots,
|
|
1399
|
+
repoRootPath,
|
|
1400
|
+
verbose
|
|
1401
|
+
}) : [];
|
|
1402
|
+
let referenceAnswer = "";
|
|
1403
|
+
if (outputSegments.length > 0) {
|
|
1404
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1405
|
+
const content = lastMessage.content;
|
|
1406
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1407
|
+
if (typeof content === "string") {
|
|
1408
|
+
referenceAnswer = content;
|
|
1409
|
+
} else if (content !== void 0 && content !== null) {
|
|
1410
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1411
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1412
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
1415
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1416
|
+
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
1417
|
+
const mergedExecution = caseExecution ?? globalExecution;
|
|
1418
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1419
|
+
let evaluators;
|
|
1420
|
+
try {
|
|
1421
|
+
evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
|
|
1422
|
+
} catch (error) {
|
|
1423
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1424
|
+
logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
|
|
1425
|
+
continue;
|
|
1426
|
+
}
|
|
1427
|
+
const inlineRubrics = evalcase.rubrics;
|
|
1428
|
+
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1429
|
+
const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
|
|
1430
|
+
if (typeof rubric === "string") {
|
|
1431
|
+
return {
|
|
1432
|
+
id: `rubric-${index + 1}`,
|
|
1433
|
+
description: rubric,
|
|
1434
|
+
weight: 1,
|
|
1435
|
+
required: true
|
|
1436
|
+
};
|
|
1437
|
+
}
|
|
1438
|
+
return {
|
|
1439
|
+
id: asString4(rubric.id) ?? `rubric-${index + 1}`,
|
|
1440
|
+
description: asString4(rubric.description) ?? "",
|
|
1441
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1442
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1443
|
+
};
|
|
1444
|
+
}).filter((r) => r.description.length > 0);
|
|
1445
|
+
if (rubricItems.length > 0) {
|
|
1446
|
+
const rubricEvaluator = {
|
|
1447
|
+
name: "rubric",
|
|
1448
|
+
type: "llm_judge",
|
|
1449
|
+
rubrics: rubricItems
|
|
1450
|
+
};
|
|
1451
|
+
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1452
|
+
}
|
|
1453
|
+
}
|
|
1454
|
+
const userFilePaths = [];
|
|
1455
|
+
for (const segment of inputSegments) {
|
|
1456
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
1457
|
+
userFilePaths.push(segment.resolvedPath);
|
|
1458
|
+
}
|
|
1459
|
+
}
|
|
1460
|
+
const allFilePaths = [
|
|
1461
|
+
...guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
|
|
1462
|
+
...userFilePaths
|
|
1463
|
+
];
|
|
1464
|
+
const testCase = {
|
|
1465
|
+
id,
|
|
1466
|
+
dataset: datasetName,
|
|
1467
|
+
conversation_id: conversationId,
|
|
1468
|
+
question,
|
|
1469
|
+
input_messages: inputMessages,
|
|
1470
|
+
input_segments: inputSegments,
|
|
1471
|
+
expected_messages: outputSegments,
|
|
1472
|
+
reference_answer: referenceAnswer,
|
|
1473
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
|
|
1474
|
+
guideline_patterns: guidelinePatterns,
|
|
1475
|
+
file_paths: allFilePaths,
|
|
1476
|
+
expected_outcome: outcome,
|
|
1477
|
+
evaluator: evalCaseEvaluatorKind,
|
|
1478
|
+
evaluators
|
|
1479
|
+
};
|
|
1480
|
+
if (verbose) {
|
|
1481
|
+
console.log(`
|
|
1482
|
+
[Eval Case: ${id}]`);
|
|
1483
|
+
if (testCase.guideline_paths.length > 0) {
|
|
1484
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
1485
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
1486
|
+
console.log(` - ${guidelinePath}`);
|
|
1487
|
+
}
|
|
1488
|
+
} else {
|
|
1489
|
+
console.log(" No guidelines found");
|
|
1490
|
+
}
|
|
1491
|
+
}
|
|
1492
|
+
results.push(testCase);
|
|
1493
|
+
}
|
|
1494
|
+
return results;
|
|
1495
|
+
}
|
|
1496
|
+
function asString4(value) {
|
|
1497
|
+
return typeof value === "string" ? value : void 0;
|
|
1498
|
+
}
|
|
1499
|
+
function logWarning4(message, details) {
|
|
1500
|
+
if (details && details.length > 0) {
|
|
1501
|
+
const detailBlock = details.join("\n");
|
|
1502
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
1503
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1504
|
+
} else {
|
|
1505
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
1506
|
+
}
|
|
1507
|
+
}
|
|
1508
|
+
function logError(message, details) {
|
|
1509
|
+
if (details && details.length > 0) {
|
|
1510
|
+
const detailBlock = details.join("\n");
|
|
1511
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1512
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1513
|
+
} else {
|
|
1514
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET5}`);
|
|
1515
|
+
}
|
|
1516
|
+
}
|
|
1517
|
+
|
|
1518
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
1519
|
+
var import_promises6 = require("fs/promises");
|
|
1520
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
1521
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
1522
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
1281
1523
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
1282
1524
|
const guidelineParts = [];
|
|
1283
1525
|
for (const rawPath of testCase.guideline_paths) {
|
|
1284
|
-
const absolutePath =
|
|
1526
|
+
const absolutePath = import_node_path6.default.resolve(rawPath);
|
|
1285
1527
|
if (!await fileExists(absolutePath)) {
|
|
1286
|
-
|
|
1528
|
+
logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
1287
1529
|
continue;
|
|
1288
1530
|
}
|
|
1289
1531
|
try {
|
|
1290
|
-
const content = (await (0,
|
|
1532
|
+
const content = (await (0, import_promises6.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
1291
1533
|
guidelineParts.push({
|
|
1292
1534
|
content,
|
|
1293
1535
|
isFile: true,
|
|
1294
|
-
displayPath:
|
|
1536
|
+
displayPath: import_node_path6.default.basename(absolutePath)
|
|
1295
1537
|
});
|
|
1296
1538
|
} catch (error) {
|
|
1297
|
-
|
|
1539
|
+
logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
1298
1540
|
}
|
|
1299
1541
|
}
|
|
1300
1542
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -1318,9 +1560,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1318
1560
|
messageSegments.push({ type: "text", value: segment });
|
|
1319
1561
|
}
|
|
1320
1562
|
} else if (isJsonObject(segment)) {
|
|
1321
|
-
const type =
|
|
1563
|
+
const type = asString5(segment.type);
|
|
1322
1564
|
if (type === "file") {
|
|
1323
|
-
const value =
|
|
1565
|
+
const value = asString5(segment.value);
|
|
1324
1566
|
if (!value) continue;
|
|
1325
1567
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
1326
1568
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -1331,7 +1573,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1331
1573
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
1332
1574
|
}
|
|
1333
1575
|
} else if (type === "text") {
|
|
1334
|
-
const textValue =
|
|
1576
|
+
const textValue = asString5(segment.value);
|
|
1335
1577
|
if (textValue && textValue.trim().length > 0) {
|
|
1336
1578
|
messageSegments.push({ type: "text", value: textValue });
|
|
1337
1579
|
}
|
|
@@ -1485,22 +1727,22 @@ ${guidelineContent.trim()}`);
|
|
|
1485
1727
|
}
|
|
1486
1728
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
1487
1729
|
}
|
|
1488
|
-
function
|
|
1730
|
+
function asString5(value) {
|
|
1489
1731
|
return typeof value === "string" ? value : void 0;
|
|
1490
1732
|
}
|
|
1491
|
-
function
|
|
1492
|
-
console.warn(`${
|
|
1733
|
+
function logWarning5(message) {
|
|
1734
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1493
1735
|
}
|
|
1494
1736
|
|
|
1495
1737
|
// src/evaluation/yaml-parser.ts
|
|
1496
|
-
var
|
|
1497
|
-
var
|
|
1498
|
-
var
|
|
1738
|
+
var ANSI_YELLOW7 = "\x1B[33m";
|
|
1739
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
1740
|
+
var ANSI_RESET7 = "\x1B[0m";
|
|
1499
1741
|
async function readTestSuiteMetadata(testFilePath) {
|
|
1500
1742
|
try {
|
|
1501
|
-
const absolutePath =
|
|
1502
|
-
const content = await (0,
|
|
1503
|
-
const parsed = (0,
|
|
1743
|
+
const absolutePath = import_node_path7.default.resolve(testFilePath);
|
|
1744
|
+
const content = await (0, import_promises7.readFile)(absolutePath, "utf8");
|
|
1745
|
+
const parsed = (0, import_yaml3.parse)(content);
|
|
1504
1746
|
if (!isJsonObject(parsed)) {
|
|
1505
1747
|
return {};
|
|
1506
1748
|
}
|
|
@@ -1510,21 +1752,25 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
1510
1752
|
}
|
|
1511
1753
|
}
|
|
1512
1754
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
1755
|
+
const format = detectFormat(evalFilePath);
|
|
1756
|
+
if (format === "jsonl") {
|
|
1757
|
+
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
1758
|
+
}
|
|
1513
1759
|
const verbose = options?.verbose ?? false;
|
|
1514
1760
|
const evalIdFilter = options?.evalId;
|
|
1515
|
-
const absoluteTestPath =
|
|
1761
|
+
const absoluteTestPath = import_node_path7.default.resolve(evalFilePath);
|
|
1516
1762
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1517
1763
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
1518
1764
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1519
1765
|
const guidelinePatterns = config?.guideline_patterns;
|
|
1520
|
-
const rawFile = await (0,
|
|
1521
|
-
const parsed = (0,
|
|
1766
|
+
const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
|
|
1767
|
+
const parsed = (0, import_yaml3.parse)(rawFile);
|
|
1522
1768
|
if (!isJsonObject(parsed)) {
|
|
1523
1769
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
1524
1770
|
}
|
|
1525
1771
|
const suite = parsed;
|
|
1526
|
-
const datasetNameFromSuite =
|
|
1527
|
-
const fallbackDataset =
|
|
1772
|
+
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
1773
|
+
const fallbackDataset = import_node_path7.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
1528
1774
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
1529
1775
|
const rawTestcases = suite.evalcases;
|
|
1530
1776
|
if (!Array.isArray(rawTestcases)) {
|
|
@@ -1532,24 +1778,24 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1532
1778
|
}
|
|
1533
1779
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
1534
1780
|
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
1535
|
-
const _globalTarget =
|
|
1781
|
+
const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
|
|
1536
1782
|
const results = [];
|
|
1537
1783
|
for (const rawEvalcase of rawTestcases) {
|
|
1538
1784
|
if (!isJsonObject(rawEvalcase)) {
|
|
1539
|
-
|
|
1785
|
+
logWarning6("Skipping invalid eval case entry (expected object)");
|
|
1540
1786
|
continue;
|
|
1541
1787
|
}
|
|
1542
1788
|
const evalcase = rawEvalcase;
|
|
1543
|
-
const id =
|
|
1789
|
+
const id = asString6(evalcase.id);
|
|
1544
1790
|
if (evalIdFilter && id !== evalIdFilter) {
|
|
1545
1791
|
continue;
|
|
1546
1792
|
}
|
|
1547
|
-
const conversationId =
|
|
1548
|
-
const outcome =
|
|
1793
|
+
const conversationId = asString6(evalcase.conversation_id);
|
|
1794
|
+
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
1549
1795
|
const inputMessagesValue = evalcase.input_messages;
|
|
1550
1796
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
1551
1797
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
1552
|
-
|
|
1798
|
+
logError2(
|
|
1553
1799
|
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
1554
1800
|
);
|
|
1555
1801
|
continue;
|
|
@@ -1560,7 +1806,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1560
1806
|
);
|
|
1561
1807
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1562
1808
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1563
|
-
|
|
1809
|
+
logError2(`No valid expected message found for eval case: ${id}`);
|
|
1564
1810
|
continue;
|
|
1565
1811
|
}
|
|
1566
1812
|
const guidelinePaths = [];
|
|
@@ -1601,7 +1847,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1601
1847
|
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
1602
1848
|
} catch (error) {
|
|
1603
1849
|
const message = error instanceof Error ? error.message : String(error);
|
|
1604
|
-
|
|
1850
|
+
logError2(`Skipping eval case '${id}': ${message}`);
|
|
1605
1851
|
continue;
|
|
1606
1852
|
}
|
|
1607
1853
|
const inlineRubrics = evalcase.rubrics;
|
|
@@ -1616,8 +1862,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1616
1862
|
};
|
|
1617
1863
|
}
|
|
1618
1864
|
return {
|
|
1619
|
-
id:
|
|
1620
|
-
description:
|
|
1865
|
+
id: asString6(rubric.id) ?? `rubric-${index + 1}`,
|
|
1866
|
+
description: asString6(rubric.description) ?? "",
|
|
1621
1867
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1622
1868
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1623
1869
|
};
|
|
@@ -1638,7 +1884,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1638
1884
|
}
|
|
1639
1885
|
}
|
|
1640
1886
|
const allFilePaths = [
|
|
1641
|
-
...guidelinePaths.map((guidelinePath) =>
|
|
1887
|
+
...guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
|
|
1642
1888
|
...userFilePaths
|
|
1643
1889
|
];
|
|
1644
1890
|
const testCase = {
|
|
@@ -1650,7 +1896,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1650
1896
|
input_segments: inputSegments,
|
|
1651
1897
|
expected_messages: outputSegments,
|
|
1652
1898
|
reference_answer: referenceAnswer,
|
|
1653
|
-
guideline_paths: guidelinePaths.map((guidelinePath) =>
|
|
1899
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
|
|
1654
1900
|
guideline_patterns: guidelinePatterns,
|
|
1655
1901
|
file_paths: allFilePaths,
|
|
1656
1902
|
expected_outcome: outcome,
|
|
@@ -1673,35 +1919,35 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1673
1919
|
}
|
|
1674
1920
|
return results;
|
|
1675
1921
|
}
|
|
1676
|
-
function
|
|
1922
|
+
function asString6(value) {
|
|
1677
1923
|
return typeof value === "string" ? value : void 0;
|
|
1678
1924
|
}
|
|
1679
|
-
function
|
|
1925
|
+
function logWarning6(message, details) {
|
|
1680
1926
|
if (details && details.length > 0) {
|
|
1681
1927
|
const detailBlock = details.join("\n");
|
|
1682
|
-
console.warn(`${
|
|
1683
|
-
${detailBlock}${
|
|
1928
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}
|
|
1929
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1684
1930
|
} else {
|
|
1685
|
-
console.warn(`${
|
|
1931
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
|
|
1686
1932
|
}
|
|
1687
1933
|
}
|
|
1688
|
-
function
|
|
1934
|
+
function logError2(message, details) {
|
|
1689
1935
|
if (details && details.length > 0) {
|
|
1690
1936
|
const detailBlock = details.join("\n");
|
|
1691
|
-
console.error(`${
|
|
1692
|
-
${detailBlock}${
|
|
1937
|
+
console.error(`${ANSI_RED2}Error: ${message}
|
|
1938
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1693
1939
|
} else {
|
|
1694
|
-
console.error(`${
|
|
1940
|
+
console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
|
|
1695
1941
|
}
|
|
1696
1942
|
}
|
|
1697
1943
|
|
|
1698
1944
|
// src/evaluation/file-utils.ts
|
|
1699
1945
|
var import_node_fs2 = require("fs");
|
|
1700
|
-
var
|
|
1701
|
-
var
|
|
1946
|
+
var import_promises8 = require("fs/promises");
|
|
1947
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1702
1948
|
async function fileExists2(filePath) {
|
|
1703
1949
|
try {
|
|
1704
|
-
await (0,
|
|
1950
|
+
await (0, import_promises8.access)(filePath, import_node_fs2.constants.F_OK);
|
|
1705
1951
|
return true;
|
|
1706
1952
|
} catch {
|
|
1707
1953
|
return false;
|
|
@@ -1711,22 +1957,22 @@ function normalizeLineEndings(content) {
|
|
|
1711
1957
|
return content.replace(/\r\n/g, "\n");
|
|
1712
1958
|
}
|
|
1713
1959
|
async function readTextFile(filePath) {
|
|
1714
|
-
const content = await (0,
|
|
1960
|
+
const content = await (0, import_promises8.readFile)(filePath, "utf8");
|
|
1715
1961
|
return normalizeLineEndings(content);
|
|
1716
1962
|
}
|
|
1717
1963
|
async function readJsonFile(filePath) {
|
|
1718
|
-
const content = await (0,
|
|
1964
|
+
const content = await (0, import_promises8.readFile)(filePath, "utf8");
|
|
1719
1965
|
return JSON.parse(content);
|
|
1720
1966
|
}
|
|
1721
1967
|
async function findGitRoot(startPath) {
|
|
1722
|
-
let currentDir =
|
|
1723
|
-
const root =
|
|
1968
|
+
let currentDir = import_node_path8.default.dirname(import_node_path8.default.resolve(startPath));
|
|
1969
|
+
const root = import_node_path8.default.parse(currentDir).root;
|
|
1724
1970
|
while (currentDir !== root) {
|
|
1725
|
-
const gitPath =
|
|
1971
|
+
const gitPath = import_node_path8.default.join(currentDir, ".git");
|
|
1726
1972
|
if (await fileExists2(gitPath)) {
|
|
1727
1973
|
return currentDir;
|
|
1728
1974
|
}
|
|
1729
|
-
const parentDir =
|
|
1975
|
+
const parentDir = import_node_path8.default.dirname(currentDir);
|
|
1730
1976
|
if (parentDir === currentDir) {
|
|
1731
1977
|
break;
|
|
1732
1978
|
}
|
|
@@ -1737,8 +1983,8 @@ async function findGitRoot(startPath) {
|
|
|
1737
1983
|
function buildDirectoryChain2(filePath, repoRoot) {
|
|
1738
1984
|
const directories = [];
|
|
1739
1985
|
const seen = /* @__PURE__ */ new Set();
|
|
1740
|
-
const boundary =
|
|
1741
|
-
let current =
|
|
1986
|
+
const boundary = import_node_path8.default.resolve(repoRoot);
|
|
1987
|
+
let current = import_node_path8.default.resolve(import_node_path8.default.dirname(filePath));
|
|
1742
1988
|
while (current !== void 0) {
|
|
1743
1989
|
if (!seen.has(current)) {
|
|
1744
1990
|
directories.push(current);
|
|
@@ -1747,7 +1993,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
1747
1993
|
if (current === boundary) {
|
|
1748
1994
|
break;
|
|
1749
1995
|
}
|
|
1750
|
-
const parent =
|
|
1996
|
+
const parent = import_node_path8.default.dirname(current);
|
|
1751
1997
|
if (parent === current) {
|
|
1752
1998
|
break;
|
|
1753
1999
|
}
|
|
@@ -1761,16 +2007,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
1761
2007
|
function buildSearchRoots2(evalPath, repoRoot) {
|
|
1762
2008
|
const uniqueRoots = [];
|
|
1763
2009
|
const addRoot = (root) => {
|
|
1764
|
-
const normalized =
|
|
2010
|
+
const normalized = import_node_path8.default.resolve(root);
|
|
1765
2011
|
if (!uniqueRoots.includes(normalized)) {
|
|
1766
2012
|
uniqueRoots.push(normalized);
|
|
1767
2013
|
}
|
|
1768
2014
|
};
|
|
1769
|
-
let currentDir =
|
|
2015
|
+
let currentDir = import_node_path8.default.dirname(evalPath);
|
|
1770
2016
|
let reachedBoundary = false;
|
|
1771
2017
|
while (!reachedBoundary) {
|
|
1772
2018
|
addRoot(currentDir);
|
|
1773
|
-
const parentDir =
|
|
2019
|
+
const parentDir = import_node_path8.default.dirname(currentDir);
|
|
1774
2020
|
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
1775
2021
|
reachedBoundary = true;
|
|
1776
2022
|
} else {
|
|
@@ -1788,16 +2034,16 @@ function trimLeadingSeparators2(value) {
|
|
|
1788
2034
|
async function resolveFileReference2(rawValue, searchRoots) {
|
|
1789
2035
|
const displayPath = trimLeadingSeparators2(rawValue);
|
|
1790
2036
|
const potentialPaths = [];
|
|
1791
|
-
if (
|
|
1792
|
-
potentialPaths.push(
|
|
2037
|
+
if (import_node_path8.default.isAbsolute(rawValue)) {
|
|
2038
|
+
potentialPaths.push(import_node_path8.default.normalize(rawValue));
|
|
1793
2039
|
}
|
|
1794
2040
|
for (const base of searchRoots) {
|
|
1795
|
-
potentialPaths.push(
|
|
2041
|
+
potentialPaths.push(import_node_path8.default.resolve(base, displayPath));
|
|
1796
2042
|
}
|
|
1797
2043
|
const attempted = [];
|
|
1798
2044
|
const seen = /* @__PURE__ */ new Set();
|
|
1799
2045
|
for (const candidate of potentialPaths) {
|
|
1800
|
-
const absoluteCandidate =
|
|
2046
|
+
const absoluteCandidate = import_node_path8.default.resolve(candidate);
|
|
1801
2047
|
if (seen.has(absoluteCandidate)) {
|
|
1802
2048
|
continue;
|
|
1803
2049
|
}
|
|
@@ -2147,9 +2393,9 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
2147
2393
|
var import_node_child_process = require("child_process");
|
|
2148
2394
|
var import_node_crypto = require("crypto");
|
|
2149
2395
|
var import_node_fs3 = require("fs");
|
|
2150
|
-
var
|
|
2396
|
+
var import_promises9 = require("fs/promises");
|
|
2151
2397
|
var import_node_os = require("os");
|
|
2152
|
-
var
|
|
2398
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
2153
2399
|
|
|
2154
2400
|
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
2155
2401
|
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
|
|
@@ -2205,7 +2451,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
|
|
|
2205
2451
|
}
|
|
2206
2452
|
|
|
2207
2453
|
// src/evaluation/providers/preread.ts
|
|
2208
|
-
var
|
|
2454
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
2209
2455
|
function buildPromptDocument(request, inputFiles, options) {
|
|
2210
2456
|
const parts = [];
|
|
2211
2457
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -2228,7 +2474,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
2228
2474
|
}
|
|
2229
2475
|
const deduped = /* @__PURE__ */ new Map();
|
|
2230
2476
|
for (const inputFile of inputFiles) {
|
|
2231
|
-
const absolutePath =
|
|
2477
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2232
2478
|
if (!deduped.has(absolutePath)) {
|
|
2233
2479
|
deduped.set(absolutePath, absolutePath);
|
|
2234
2480
|
}
|
|
@@ -2241,14 +2487,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
2241
2487
|
}
|
|
2242
2488
|
const unique = /* @__PURE__ */ new Map();
|
|
2243
2489
|
for (const inputFile of inputFiles) {
|
|
2244
|
-
const absolutePath =
|
|
2490
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2245
2491
|
if (overrides?.has(absolutePath)) {
|
|
2246
2492
|
if (!unique.has(absolutePath)) {
|
|
2247
2493
|
unique.set(absolutePath, absolutePath);
|
|
2248
2494
|
}
|
|
2249
2495
|
continue;
|
|
2250
2496
|
}
|
|
2251
|
-
const normalized = absolutePath.split(
|
|
2497
|
+
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
2252
2498
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2253
2499
|
if (!unique.has(absolutePath)) {
|
|
2254
2500
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2263,7 +2509,7 @@ function collectInputFiles(inputFiles) {
|
|
|
2263
2509
|
}
|
|
2264
2510
|
const unique = /* @__PURE__ */ new Map();
|
|
2265
2511
|
for (const inputFile of inputFiles) {
|
|
2266
|
-
const absolutePath =
|
|
2512
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2267
2513
|
if (!unique.has(absolutePath)) {
|
|
2268
2514
|
unique.set(absolutePath, absolutePath);
|
|
2269
2515
|
}
|
|
@@ -2275,7 +2521,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
2275
2521
|
return "";
|
|
2276
2522
|
}
|
|
2277
2523
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2278
|
-
const fileName =
|
|
2524
|
+
const fileName = import_node_path9.default.basename(absolutePath);
|
|
2279
2525
|
const fileUri = pathToFileUri(absolutePath);
|
|
2280
2526
|
return `* [${fileName}](${fileUri})`;
|
|
2281
2527
|
});
|
|
@@ -2295,7 +2541,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
2295
2541
|
return sections.join("\n");
|
|
2296
2542
|
}
|
|
2297
2543
|
function pathToFileUri(filePath) {
|
|
2298
|
-
const absolutePath =
|
|
2544
|
+
const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
|
|
2299
2545
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2300
2546
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2301
2547
|
return `file:///${normalizedPath}`;
|
|
@@ -2332,8 +2578,8 @@ var ClaudeCodeProvider = class {
|
|
|
2332
2578
|
const workspaceRoot = await this.createWorkspace();
|
|
2333
2579
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2334
2580
|
try {
|
|
2335
|
-
const promptFile =
|
|
2336
|
-
await (0,
|
|
2581
|
+
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
2582
|
+
await (0, import_promises9.writeFile)(promptFile, request.question, "utf8");
|
|
2337
2583
|
const args = this.buildClaudeCodeArgs(request.question, inputFiles);
|
|
2338
2584
|
const cwd = this.resolveCwd();
|
|
2339
2585
|
const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
|
|
@@ -2380,7 +2626,7 @@ var ClaudeCodeProvider = class {
|
|
|
2380
2626
|
if (!this.config.cwd) {
|
|
2381
2627
|
return process.cwd();
|
|
2382
2628
|
}
|
|
2383
|
-
return
|
|
2629
|
+
return import_node_path10.default.resolve(this.config.cwd);
|
|
2384
2630
|
}
|
|
2385
2631
|
buildClaudeCodeArgs(prompt, inputFiles) {
|
|
2386
2632
|
const args = [];
|
|
@@ -2437,11 +2683,11 @@ ${filesContext}`;
|
|
|
2437
2683
|
}
|
|
2438
2684
|
}
|
|
2439
2685
|
async createWorkspace() {
|
|
2440
|
-
return await (0,
|
|
2686
|
+
return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
|
|
2441
2687
|
}
|
|
2442
2688
|
async cleanupWorkspace(workspaceRoot) {
|
|
2443
2689
|
try {
|
|
2444
|
-
await (0,
|
|
2690
|
+
await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
|
|
2445
2691
|
} catch {
|
|
2446
2692
|
}
|
|
2447
2693
|
}
|
|
@@ -2451,9 +2697,9 @@ ${filesContext}`;
|
|
|
2451
2697
|
return void 0;
|
|
2452
2698
|
}
|
|
2453
2699
|
if (this.config.logDir) {
|
|
2454
|
-
return
|
|
2700
|
+
return import_node_path10.default.resolve(this.config.logDir);
|
|
2455
2701
|
}
|
|
2456
|
-
return
|
|
2702
|
+
return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "claude-code");
|
|
2457
2703
|
}
|
|
2458
2704
|
async createStreamLogger(request) {
|
|
2459
2705
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2461,13 +2707,13 @@ ${filesContext}`;
|
|
|
2461
2707
|
return void 0;
|
|
2462
2708
|
}
|
|
2463
2709
|
try {
|
|
2464
|
-
await (0,
|
|
2710
|
+
await (0, import_promises9.mkdir)(logDir, { recursive: true });
|
|
2465
2711
|
} catch (error) {
|
|
2466
2712
|
const message = error instanceof Error ? error.message : String(error);
|
|
2467
2713
|
console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
|
|
2468
2714
|
return void 0;
|
|
2469
2715
|
}
|
|
2470
|
-
const filePath =
|
|
2716
|
+
const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
2471
2717
|
try {
|
|
2472
2718
|
const logger = await ClaudeCodeStreamLogger.create({
|
|
2473
2719
|
filePath,
|
|
@@ -2872,16 +3118,16 @@ function escapeShellArg(arg) {
|
|
|
2872
3118
|
}
|
|
2873
3119
|
async function defaultClaudeCodeRunner(options) {
|
|
2874
3120
|
const tempId = (0, import_node_crypto.randomUUID)();
|
|
2875
|
-
const stdoutFile =
|
|
2876
|
-
const stderrFile =
|
|
2877
|
-
const exitFile =
|
|
2878
|
-
const pidFile =
|
|
3121
|
+
const stdoutFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stdout`);
|
|
3122
|
+
const stderrFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stderr`);
|
|
3123
|
+
const exitFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-exit`);
|
|
3124
|
+
const pidFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-pid`);
|
|
2879
3125
|
try {
|
|
2880
3126
|
return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
|
|
2881
3127
|
} finally {
|
|
2882
3128
|
for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
|
|
2883
3129
|
try {
|
|
2884
|
-
await (0,
|
|
3130
|
+
await (0, import_promises9.rm)(file, { force: true });
|
|
2885
3131
|
} catch {
|
|
2886
3132
|
}
|
|
2887
3133
|
}
|
|
@@ -2915,8 +3161,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2915
3161
|
let lastStdoutSize = 0;
|
|
2916
3162
|
const readFileIfExists = async (filePath) => {
|
|
2917
3163
|
try {
|
|
2918
|
-
const { readFile:
|
|
2919
|
-
return await
|
|
3164
|
+
const { readFile: readFile9 } = await import("fs/promises");
|
|
3165
|
+
return await readFile9(filePath, "utf8");
|
|
2920
3166
|
} catch {
|
|
2921
3167
|
return "";
|
|
2922
3168
|
}
|
|
@@ -2989,9 +3235,9 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2989
3235
|
|
|
2990
3236
|
// src/evaluation/providers/cli.ts
|
|
2991
3237
|
var import_node_child_process2 = require("child_process");
|
|
2992
|
-
var
|
|
3238
|
+
var import_promises10 = __toESM(require("fs/promises"), 1);
|
|
2993
3239
|
var import_node_os2 = __toESM(require("os"), 1);
|
|
2994
|
-
var
|
|
3240
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2995
3241
|
var import_node_util = require("util");
|
|
2996
3242
|
var import_zod = require("zod");
|
|
2997
3243
|
var ToolCallSchema = import_zod.z.object({
|
|
@@ -3360,7 +3606,7 @@ var CliProvider = class {
|
|
|
3360
3606
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
3361
3607
|
} finally {
|
|
3362
3608
|
if (!this.keepTempFiles) {
|
|
3363
|
-
await
|
|
3609
|
+
await import_promises10.default.unlink(filePath).catch(() => {
|
|
3364
3610
|
});
|
|
3365
3611
|
}
|
|
3366
3612
|
}
|
|
@@ -3448,7 +3694,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
3448
3694
|
}
|
|
3449
3695
|
const unique = /* @__PURE__ */ new Map();
|
|
3450
3696
|
for (const inputFile of inputFiles) {
|
|
3451
|
-
const absolutePath =
|
|
3697
|
+
const absolutePath = import_node_path11.default.resolve(inputFile);
|
|
3452
3698
|
if (!unique.has(absolutePath)) {
|
|
3453
3699
|
unique.set(absolutePath, absolutePath);
|
|
3454
3700
|
}
|
|
@@ -3462,7 +3708,7 @@ function formatFileList(files, template) {
|
|
|
3462
3708
|
const formatter = template ?? "{path}";
|
|
3463
3709
|
return files.map((filePath) => {
|
|
3464
3710
|
const escapedPath = shellEscape(filePath);
|
|
3465
|
-
const escapedName = shellEscape(
|
|
3711
|
+
const escapedName = shellEscape(import_node_path11.default.basename(filePath));
|
|
3466
3712
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
3467
3713
|
}).join(" ");
|
|
3468
3714
|
}
|
|
@@ -3486,7 +3732,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
3486
3732
|
const safeEvalId = evalCaseId || "unknown";
|
|
3487
3733
|
const timestamp = Date.now();
|
|
3488
3734
|
const random = Math.random().toString(36).substring(2, 9);
|
|
3489
|
-
return
|
|
3735
|
+
return import_node_path11.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
3490
3736
|
}
|
|
3491
3737
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
3492
3738
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -3500,9 +3746,9 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
3500
3746
|
var import_node_child_process3 = require("child_process");
|
|
3501
3747
|
var import_node_crypto2 = require("crypto");
|
|
3502
3748
|
var import_node_fs4 = require("fs");
|
|
3503
|
-
var
|
|
3749
|
+
var import_promises11 = require("fs/promises");
|
|
3504
3750
|
var import_node_os3 = require("os");
|
|
3505
|
-
var
|
|
3751
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3506
3752
|
var import_node_util2 = require("util");
|
|
3507
3753
|
|
|
3508
3754
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -3597,8 +3843,8 @@ var CodexProvider = class {
|
|
|
3597
3843
|
const promptContent = `${systemPrompt}
|
|
3598
3844
|
|
|
3599
3845
|
${basePrompt}`;
|
|
3600
|
-
const promptFile =
|
|
3601
|
-
await (0,
|
|
3846
|
+
const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3847
|
+
await (0, import_promises11.writeFile)(promptFile, promptContent, "utf8");
|
|
3602
3848
|
const args = this.buildCodexArgs();
|
|
3603
3849
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
3604
3850
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -3647,7 +3893,7 @@ ${basePrompt}`;
|
|
|
3647
3893
|
if (!this.config.cwd) {
|
|
3648
3894
|
return workspaceRoot;
|
|
3649
3895
|
}
|
|
3650
|
-
return
|
|
3896
|
+
return import_node_path12.default.resolve(this.config.cwd);
|
|
3651
3897
|
}
|
|
3652
3898
|
buildCodexArgs() {
|
|
3653
3899
|
const args = [
|
|
@@ -3689,11 +3935,11 @@ ${basePrompt}`;
|
|
|
3689
3935
|
}
|
|
3690
3936
|
}
|
|
3691
3937
|
async createWorkspace() {
|
|
3692
|
-
return await (0,
|
|
3938
|
+
return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
|
|
3693
3939
|
}
|
|
3694
3940
|
async cleanupWorkspace(workspaceRoot) {
|
|
3695
3941
|
try {
|
|
3696
|
-
await (0,
|
|
3942
|
+
await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
|
|
3697
3943
|
} catch {
|
|
3698
3944
|
}
|
|
3699
3945
|
}
|
|
@@ -3703,9 +3949,9 @@ ${basePrompt}`;
|
|
|
3703
3949
|
return void 0;
|
|
3704
3950
|
}
|
|
3705
3951
|
if (this.config.logDir) {
|
|
3706
|
-
return
|
|
3952
|
+
return import_node_path12.default.resolve(this.config.logDir);
|
|
3707
3953
|
}
|
|
3708
|
-
return
|
|
3954
|
+
return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
3709
3955
|
}
|
|
3710
3956
|
async createStreamLogger(request) {
|
|
3711
3957
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3713,13 +3959,13 @@ ${basePrompt}`;
|
|
|
3713
3959
|
return void 0;
|
|
3714
3960
|
}
|
|
3715
3961
|
try {
|
|
3716
|
-
await (0,
|
|
3962
|
+
await (0, import_promises11.mkdir)(logDir, { recursive: true });
|
|
3717
3963
|
} catch (error) {
|
|
3718
3964
|
const message = error instanceof Error ? error.message : String(error);
|
|
3719
3965
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
3720
3966
|
return void 0;
|
|
3721
3967
|
}
|
|
3722
|
-
const filePath =
|
|
3968
|
+
const filePath = import_node_path12.default.join(logDir, buildLogFilename2(request, this.targetName));
|
|
3723
3969
|
try {
|
|
3724
3970
|
const logger = await CodexStreamLogger.create({
|
|
3725
3971
|
filePath,
|
|
@@ -3934,9 +4180,9 @@ function tryParseJsonValue2(rawLine) {
|
|
|
3934
4180
|
async function locateExecutable(candidate) {
|
|
3935
4181
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
3936
4182
|
if (includesPathSeparator) {
|
|
3937
|
-
const resolved =
|
|
4183
|
+
const resolved = import_node_path12.default.isAbsolute(candidate) ? candidate : import_node_path12.default.resolve(candidate);
|
|
3938
4184
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
3939
|
-
await (0,
|
|
4185
|
+
await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3940
4186
|
return executablePath;
|
|
3941
4187
|
}
|
|
3942
4188
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -3946,7 +4192,7 @@ async function locateExecutable(candidate) {
|
|
|
3946
4192
|
const preferred = selectExecutableCandidate(lines);
|
|
3947
4193
|
if (preferred) {
|
|
3948
4194
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
3949
|
-
await (0,
|
|
4195
|
+
await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3950
4196
|
return executablePath;
|
|
3951
4197
|
}
|
|
3952
4198
|
} catch {
|
|
@@ -3980,7 +4226,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
3980
4226
|
for (const ext of extensions) {
|
|
3981
4227
|
const withExtension = `${candidate}${ext}`;
|
|
3982
4228
|
try {
|
|
3983
|
-
await (0,
|
|
4229
|
+
await (0, import_promises11.access)(withExtension, import_node_fs4.constants.F_OK);
|
|
3984
4230
|
return withExtension;
|
|
3985
4231
|
} catch {
|
|
3986
4232
|
}
|
|
@@ -4445,9 +4691,9 @@ function extractToolCalls2(content) {
|
|
|
4445
4691
|
var import_node_child_process4 = require("child_process");
|
|
4446
4692
|
var import_node_crypto3 = require("crypto");
|
|
4447
4693
|
var import_node_fs5 = require("fs");
|
|
4448
|
-
var
|
|
4694
|
+
var import_promises12 = require("fs/promises");
|
|
4449
4695
|
var import_node_os4 = require("os");
|
|
4450
|
-
var
|
|
4696
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
4451
4697
|
|
|
4452
4698
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
4453
4699
|
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
@@ -4531,8 +4777,8 @@ var PiCodingAgentProvider = class {
|
|
|
4531
4777
|
const workspaceRoot = await this.createWorkspace();
|
|
4532
4778
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
4533
4779
|
try {
|
|
4534
|
-
const promptFile =
|
|
4535
|
-
await (0,
|
|
4780
|
+
const promptFile = import_node_path13.default.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4781
|
+
await (0, import_promises12.writeFile)(promptFile, request.question, "utf8");
|
|
4536
4782
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
4537
4783
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
4538
4784
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
@@ -4573,7 +4819,7 @@ var PiCodingAgentProvider = class {
|
|
|
4573
4819
|
if (!this.config.cwd) {
|
|
4574
4820
|
return workspaceRoot;
|
|
4575
4821
|
}
|
|
4576
|
-
return
|
|
4822
|
+
return import_node_path13.default.resolve(this.config.cwd);
|
|
4577
4823
|
}
|
|
4578
4824
|
buildPiArgs(prompt, inputFiles) {
|
|
4579
4825
|
const args = [];
|
|
@@ -4662,19 +4908,19 @@ ${prompt}`;
|
|
|
4662
4908
|
return env;
|
|
4663
4909
|
}
|
|
4664
4910
|
async createWorkspace() {
|
|
4665
|
-
return await (0,
|
|
4911
|
+
return await (0, import_promises12.mkdtemp)(import_node_path13.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
|
|
4666
4912
|
}
|
|
4667
4913
|
async cleanupWorkspace(workspaceRoot) {
|
|
4668
4914
|
try {
|
|
4669
|
-
await (0,
|
|
4915
|
+
await (0, import_promises12.rm)(workspaceRoot, { recursive: true, force: true });
|
|
4670
4916
|
} catch {
|
|
4671
4917
|
}
|
|
4672
4918
|
}
|
|
4673
4919
|
resolveLogDirectory() {
|
|
4674
4920
|
if (this.config.logDir) {
|
|
4675
|
-
return
|
|
4921
|
+
return import_node_path13.default.resolve(this.config.logDir);
|
|
4676
4922
|
}
|
|
4677
|
-
return
|
|
4923
|
+
return import_node_path13.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
4678
4924
|
}
|
|
4679
4925
|
async createStreamLogger(request) {
|
|
4680
4926
|
const logDir = this.resolveLogDirectory();
|
|
@@ -4682,13 +4928,13 @@ ${prompt}`;
|
|
|
4682
4928
|
return void 0;
|
|
4683
4929
|
}
|
|
4684
4930
|
try {
|
|
4685
|
-
await (0,
|
|
4931
|
+
await (0, import_promises12.mkdir)(logDir, { recursive: true });
|
|
4686
4932
|
} catch (error) {
|
|
4687
4933
|
const message = error instanceof Error ? error.message : String(error);
|
|
4688
4934
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
4689
4935
|
return void 0;
|
|
4690
4936
|
}
|
|
4691
|
-
const filePath =
|
|
4937
|
+
const filePath = import_node_path13.default.join(logDir, buildLogFilename3(request, this.targetName));
|
|
4692
4938
|
try {
|
|
4693
4939
|
const logger = await PiStreamLogger.create({
|
|
4694
4940
|
filePath,
|
|
@@ -5121,7 +5367,7 @@ async function defaultPiRunner(options) {
|
|
|
5121
5367
|
}
|
|
5122
5368
|
|
|
5123
5369
|
// src/evaluation/providers/targets.ts
|
|
5124
|
-
var
|
|
5370
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
5125
5371
|
var import_zod2 = require("zod");
|
|
5126
5372
|
var CliHealthcheckHttpInputSchema = import_zod2.z.object({
|
|
5127
5373
|
type: import_zod2.z.literal("http"),
|
|
@@ -5227,11 +5473,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
5227
5473
|
allowLiteral: true,
|
|
5228
5474
|
optionalEnv: true
|
|
5229
5475
|
});
|
|
5230
|
-
if (cwd && evalFilePath && !
|
|
5231
|
-
cwd =
|
|
5476
|
+
if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
|
|
5477
|
+
cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
|
|
5232
5478
|
}
|
|
5233
5479
|
if (!cwd && evalFilePath) {
|
|
5234
|
-
cwd =
|
|
5480
|
+
cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
|
|
5235
5481
|
}
|
|
5236
5482
|
return {
|
|
5237
5483
|
type: "command",
|
|
@@ -5258,11 +5504,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
5258
5504
|
allowLiteral: true,
|
|
5259
5505
|
optionalEnv: true
|
|
5260
5506
|
});
|
|
5261
|
-
if (cwd && evalFilePath && !
|
|
5262
|
-
cwd =
|
|
5507
|
+
if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
|
|
5508
|
+
cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
|
|
5263
5509
|
}
|
|
5264
5510
|
if (!cwd && evalFilePath) {
|
|
5265
|
-
cwd =
|
|
5511
|
+
cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
|
|
5266
5512
|
}
|
|
5267
5513
|
const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
|
|
5268
5514
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
@@ -5767,8 +6013,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
5767
6013
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
5768
6014
|
if (!parseResult.success) {
|
|
5769
6015
|
const firstError = parseResult.error.errors[0];
|
|
5770
|
-
const
|
|
5771
|
-
const prefix =
|
|
6016
|
+
const path18 = firstError?.path.join(".") || "";
|
|
6017
|
+
const prefix = path18 ? `${target.name} ${path18}: ` : `${target.name}: `;
|
|
5772
6018
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
5773
6019
|
}
|
|
5774
6020
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -5956,7 +6202,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
5956
6202
|
}
|
|
5957
6203
|
|
|
5958
6204
|
// src/evaluation/providers/vscode.ts
|
|
5959
|
-
var
|
|
6205
|
+
var import_node_path15 = __toESM(require("path"), 1);
|
|
5960
6206
|
var import_subagent = require("subagent");
|
|
5961
6207
|
|
|
5962
6208
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -6126,7 +6372,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
6126
6372
|
return "";
|
|
6127
6373
|
}
|
|
6128
6374
|
const buildList = (files) => files.map((absolutePath) => {
|
|
6129
|
-
const fileName =
|
|
6375
|
+
const fileName = import_node_path15.default.basename(absolutePath);
|
|
6130
6376
|
const fileUri = pathToFileUri2(absolutePath);
|
|
6131
6377
|
return `* [${fileName}](${fileUri})`;
|
|
6132
6378
|
});
|
|
@@ -6151,8 +6397,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
6151
6397
|
}
|
|
6152
6398
|
const unique = /* @__PURE__ */ new Map();
|
|
6153
6399
|
for (const attachment of attachments) {
|
|
6154
|
-
const absolutePath =
|
|
6155
|
-
const normalized = absolutePath.split(
|
|
6400
|
+
const absolutePath = import_node_path15.default.resolve(attachment);
|
|
6401
|
+
const normalized = absolutePath.split(import_node_path15.default.sep).join("/");
|
|
6156
6402
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
6157
6403
|
if (!unique.has(absolutePath)) {
|
|
6158
6404
|
unique.set(absolutePath, absolutePath);
|
|
@@ -6167,7 +6413,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
6167
6413
|
}
|
|
6168
6414
|
const unique = /* @__PURE__ */ new Map();
|
|
6169
6415
|
for (const attachment of attachments) {
|
|
6170
|
-
const absolutePath =
|
|
6416
|
+
const absolutePath = import_node_path15.default.resolve(attachment);
|
|
6171
6417
|
if (!unique.has(absolutePath)) {
|
|
6172
6418
|
unique.set(absolutePath, absolutePath);
|
|
6173
6419
|
}
|
|
@@ -6175,7 +6421,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
6175
6421
|
return Array.from(unique.values());
|
|
6176
6422
|
}
|
|
6177
6423
|
function pathToFileUri2(filePath) {
|
|
6178
|
-
const absolutePath =
|
|
6424
|
+
const absolutePath = import_node_path15.default.isAbsolute(filePath) ? filePath : import_node_path15.default.resolve(filePath);
|
|
6179
6425
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
6180
6426
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
6181
6427
|
return `file:///${normalizedPath}`;
|
|
@@ -6188,7 +6434,7 @@ function normalizeAttachments(attachments) {
|
|
|
6188
6434
|
}
|
|
6189
6435
|
const deduped = /* @__PURE__ */ new Set();
|
|
6190
6436
|
for (const attachment of attachments) {
|
|
6191
|
-
deduped.add(
|
|
6437
|
+
deduped.add(import_node_path15.default.resolve(attachment));
|
|
6192
6438
|
}
|
|
6193
6439
|
return Array.from(deduped);
|
|
6194
6440
|
}
|
|
@@ -6197,7 +6443,7 @@ function mergeAttachments(all) {
|
|
|
6197
6443
|
for (const list of all) {
|
|
6198
6444
|
if (!list) continue;
|
|
6199
6445
|
for (const inputFile of list) {
|
|
6200
|
-
deduped.add(
|
|
6446
|
+
deduped.add(import_node_path15.default.resolve(inputFile));
|
|
6201
6447
|
}
|
|
6202
6448
|
}
|
|
6203
6449
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -6245,9 +6491,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
6245
6491
|
|
|
6246
6492
|
// src/evaluation/providers/targets-file.ts
|
|
6247
6493
|
var import_node_fs6 = require("fs");
|
|
6248
|
-
var
|
|
6249
|
-
var
|
|
6250
|
-
var
|
|
6494
|
+
var import_promises13 = require("fs/promises");
|
|
6495
|
+
var import_node_path16 = __toESM(require("path"), 1);
|
|
6496
|
+
var import_yaml4 = require("yaml");
|
|
6251
6497
|
function isRecord(value) {
|
|
6252
6498
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
6253
6499
|
}
|
|
@@ -6276,19 +6522,19 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
6276
6522
|
}
|
|
6277
6523
|
async function fileExists3(filePath) {
|
|
6278
6524
|
try {
|
|
6279
|
-
await (0,
|
|
6525
|
+
await (0, import_promises13.access)(filePath, import_node_fs6.constants.F_OK);
|
|
6280
6526
|
return true;
|
|
6281
6527
|
} catch {
|
|
6282
6528
|
return false;
|
|
6283
6529
|
}
|
|
6284
6530
|
}
|
|
6285
6531
|
async function readTargetDefinitions(filePath) {
|
|
6286
|
-
const absolutePath =
|
|
6532
|
+
const absolutePath = import_node_path16.default.resolve(filePath);
|
|
6287
6533
|
if (!await fileExists3(absolutePath)) {
|
|
6288
6534
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
6289
6535
|
}
|
|
6290
|
-
const raw = await (0,
|
|
6291
|
-
const parsed = (0,
|
|
6536
|
+
const raw = await (0, import_promises13.readFile)(absolutePath, "utf8");
|
|
6537
|
+
const parsed = (0, import_yaml4.parse)(raw);
|
|
6292
6538
|
if (!isRecord(parsed)) {
|
|
6293
6539
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
6294
6540
|
}
|
|
@@ -6494,15 +6740,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
6494
6740
|
});
|
|
6495
6741
|
}
|
|
6496
6742
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
6497
|
-
const { mkdir: mkdir4, readFile:
|
|
6743
|
+
const { mkdir: mkdir4, readFile: readFile9, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
|
|
6498
6744
|
const { tmpdir: tmpdir4 } = await import("os");
|
|
6499
|
-
const
|
|
6745
|
+
const path18 = await import("path");
|
|
6500
6746
|
const { randomUUID: randomUUID4 } = await import("crypto");
|
|
6501
|
-
const dir =
|
|
6747
|
+
const dir = path18.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
6502
6748
|
await mkdir4(dir, { recursive: true });
|
|
6503
|
-
const stdinPath =
|
|
6504
|
-
const stdoutPath =
|
|
6505
|
-
const stderrPath =
|
|
6749
|
+
const stdinPath = path18.join(dir, "stdin.txt");
|
|
6750
|
+
const stdoutPath = path18.join(dir, "stdout.txt");
|
|
6751
|
+
const stderrPath = path18.join(dir, "stderr.txt");
|
|
6506
6752
|
await writeFile4(stdinPath, stdinPayload, "utf8");
|
|
6507
6753
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
6508
6754
|
const { spawn: spawn4 } = await import("child_process");
|
|
@@ -6532,8 +6778,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6532
6778
|
resolve(code ?? 0);
|
|
6533
6779
|
});
|
|
6534
6780
|
});
|
|
6535
|
-
const stdout = (await
|
|
6536
|
-
const stderr = (await
|
|
6781
|
+
const stdout = (await readFile9(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
6782
|
+
const stderr = (await readFile9(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
6537
6783
|
return { stdout, stderr, exitCode };
|
|
6538
6784
|
} finally {
|
|
6539
6785
|
await rm4(dir, { recursive: true, force: true });
|
|
@@ -6805,7 +7051,7 @@ var CodeEvaluator = class {
|
|
|
6805
7051
|
outputMessages: context.outputMessages ?? null,
|
|
6806
7052
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
6807
7053
|
inputFiles: context.evalCase.file_paths.filter(
|
|
6808
|
-
(
|
|
7054
|
+
(path18) => !context.evalCase.guideline_paths.includes(path18)
|
|
6809
7055
|
),
|
|
6810
7056
|
inputMessages: context.evalCase.input_messages,
|
|
6811
7057
|
traceSummary: context.traceSummary ?? null,
|
|
@@ -7591,115 +7837,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
7591
7837
|
* Evaluate a single field against the expected value.
|
|
7592
7838
|
*/
|
|
7593
7839
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
7594
|
-
const { path:
|
|
7595
|
-
const candidateValue = resolvePath(candidateData,
|
|
7596
|
-
const expectedValue = resolvePath(expectedData,
|
|
7840
|
+
const { path: path18, match, required = true, weight = 1 } = fieldConfig;
|
|
7841
|
+
const candidateValue = resolvePath(candidateData, path18);
|
|
7842
|
+
const expectedValue = resolvePath(expectedData, path18);
|
|
7597
7843
|
if (expectedValue === void 0) {
|
|
7598
7844
|
return {
|
|
7599
|
-
path:
|
|
7845
|
+
path: path18,
|
|
7600
7846
|
score: 1,
|
|
7601
7847
|
// No expected value means no comparison needed
|
|
7602
7848
|
weight,
|
|
7603
7849
|
hit: true,
|
|
7604
|
-
message: `${
|
|
7850
|
+
message: `${path18}: no expected value`
|
|
7605
7851
|
};
|
|
7606
7852
|
}
|
|
7607
7853
|
if (candidateValue === void 0) {
|
|
7608
7854
|
if (required) {
|
|
7609
7855
|
return {
|
|
7610
|
-
path:
|
|
7856
|
+
path: path18,
|
|
7611
7857
|
score: 0,
|
|
7612
7858
|
weight,
|
|
7613
7859
|
hit: false,
|
|
7614
|
-
message: `${
|
|
7860
|
+
message: `${path18} (required, missing)`
|
|
7615
7861
|
};
|
|
7616
7862
|
}
|
|
7617
7863
|
return {
|
|
7618
|
-
path:
|
|
7864
|
+
path: path18,
|
|
7619
7865
|
score: 1,
|
|
7620
7866
|
// Don't penalize missing optional fields
|
|
7621
7867
|
weight: 0,
|
|
7622
7868
|
// Zero weight means it won't affect the score
|
|
7623
7869
|
hit: true,
|
|
7624
|
-
message: `${
|
|
7870
|
+
message: `${path18}: optional field missing`
|
|
7625
7871
|
};
|
|
7626
7872
|
}
|
|
7627
7873
|
switch (match) {
|
|
7628
7874
|
case "exact":
|
|
7629
|
-
return this.compareExact(
|
|
7875
|
+
return this.compareExact(path18, candidateValue, expectedValue, weight);
|
|
7630
7876
|
case "numeric_tolerance":
|
|
7631
7877
|
return this.compareNumericTolerance(
|
|
7632
|
-
|
|
7878
|
+
path18,
|
|
7633
7879
|
candidateValue,
|
|
7634
7880
|
expectedValue,
|
|
7635
7881
|
fieldConfig,
|
|
7636
7882
|
weight
|
|
7637
7883
|
);
|
|
7638
7884
|
case "date":
|
|
7639
|
-
return this.compareDate(
|
|
7885
|
+
return this.compareDate(path18, candidateValue, expectedValue, fieldConfig, weight);
|
|
7640
7886
|
default:
|
|
7641
7887
|
return {
|
|
7642
|
-
path:
|
|
7888
|
+
path: path18,
|
|
7643
7889
|
score: 0,
|
|
7644
7890
|
weight,
|
|
7645
7891
|
hit: false,
|
|
7646
|
-
message: `${
|
|
7892
|
+
message: `${path18}: unknown match type "${match}"`
|
|
7647
7893
|
};
|
|
7648
7894
|
}
|
|
7649
7895
|
}
|
|
7650
7896
|
/**
|
|
7651
7897
|
* Exact equality comparison.
|
|
7652
7898
|
*/
|
|
7653
|
-
compareExact(
|
|
7899
|
+
compareExact(path18, candidateValue, expectedValue, weight) {
|
|
7654
7900
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
7655
7901
|
return {
|
|
7656
|
-
path:
|
|
7902
|
+
path: path18,
|
|
7657
7903
|
score: 1,
|
|
7658
7904
|
weight,
|
|
7659
7905
|
hit: true,
|
|
7660
|
-
message:
|
|
7906
|
+
message: path18
|
|
7661
7907
|
};
|
|
7662
7908
|
}
|
|
7663
7909
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
7664
7910
|
return {
|
|
7665
|
-
path:
|
|
7911
|
+
path: path18,
|
|
7666
7912
|
score: 0,
|
|
7667
7913
|
weight,
|
|
7668
7914
|
hit: false,
|
|
7669
|
-
message: `${
|
|
7915
|
+
message: `${path18} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
7670
7916
|
};
|
|
7671
7917
|
}
|
|
7672
7918
|
return {
|
|
7673
|
-
path:
|
|
7919
|
+
path: path18,
|
|
7674
7920
|
score: 0,
|
|
7675
7921
|
weight,
|
|
7676
7922
|
hit: false,
|
|
7677
|
-
message: `${
|
|
7923
|
+
message: `${path18} (value mismatch)`
|
|
7678
7924
|
};
|
|
7679
7925
|
}
|
|
7680
7926
|
/**
|
|
7681
7927
|
* Numeric comparison with absolute or relative tolerance.
|
|
7682
7928
|
*/
|
|
7683
|
-
compareNumericTolerance(
|
|
7929
|
+
compareNumericTolerance(path18, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7684
7930
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
7685
7931
|
const candidateNum = toNumber(candidateValue);
|
|
7686
7932
|
const expectedNum = toNumber(expectedValue);
|
|
7687
7933
|
if (candidateNum === null || expectedNum === null) {
|
|
7688
7934
|
return {
|
|
7689
|
-
path:
|
|
7935
|
+
path: path18,
|
|
7690
7936
|
score: 0,
|
|
7691
7937
|
weight,
|
|
7692
7938
|
hit: false,
|
|
7693
|
-
message: `${
|
|
7939
|
+
message: `${path18} (non-numeric value)`
|
|
7694
7940
|
};
|
|
7695
7941
|
}
|
|
7696
7942
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7697
7943
|
return {
|
|
7698
|
-
path:
|
|
7944
|
+
path: path18,
|
|
7699
7945
|
score: 0,
|
|
7700
7946
|
weight,
|
|
7701
7947
|
hit: false,
|
|
7702
|
-
message: `${
|
|
7948
|
+
message: `${path18} (invalid numeric value)`
|
|
7703
7949
|
};
|
|
7704
7950
|
}
|
|
7705
7951
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -7712,61 +7958,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
7712
7958
|
}
|
|
7713
7959
|
if (withinTolerance) {
|
|
7714
7960
|
return {
|
|
7715
|
-
path:
|
|
7961
|
+
path: path18,
|
|
7716
7962
|
score: 1,
|
|
7717
7963
|
weight,
|
|
7718
7964
|
hit: true,
|
|
7719
|
-
message: `${
|
|
7965
|
+
message: `${path18} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7720
7966
|
};
|
|
7721
7967
|
}
|
|
7722
7968
|
return {
|
|
7723
|
-
path:
|
|
7969
|
+
path: path18,
|
|
7724
7970
|
score: 0,
|
|
7725
7971
|
weight,
|
|
7726
7972
|
hit: false,
|
|
7727
|
-
message: `${
|
|
7973
|
+
message: `${path18} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7728
7974
|
};
|
|
7729
7975
|
}
|
|
7730
7976
|
/**
|
|
7731
7977
|
* Date comparison with format normalization.
|
|
7732
7978
|
*/
|
|
7733
|
-
compareDate(
|
|
7979
|
+
compareDate(path18, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7734
7980
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7735
7981
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7736
7982
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7737
7983
|
if (candidateDate === null) {
|
|
7738
7984
|
return {
|
|
7739
|
-
path:
|
|
7985
|
+
path: path18,
|
|
7740
7986
|
score: 0,
|
|
7741
7987
|
weight,
|
|
7742
7988
|
hit: false,
|
|
7743
|
-
message: `${
|
|
7989
|
+
message: `${path18} (unparseable candidate date)`
|
|
7744
7990
|
};
|
|
7745
7991
|
}
|
|
7746
7992
|
if (expectedDate === null) {
|
|
7747
7993
|
return {
|
|
7748
|
-
path:
|
|
7994
|
+
path: path18,
|
|
7749
7995
|
score: 0,
|
|
7750
7996
|
weight,
|
|
7751
7997
|
hit: false,
|
|
7752
|
-
message: `${
|
|
7998
|
+
message: `${path18} (unparseable expected date)`
|
|
7753
7999
|
};
|
|
7754
8000
|
}
|
|
7755
8001
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7756
8002
|
return {
|
|
7757
|
-
path:
|
|
8003
|
+
path: path18,
|
|
7758
8004
|
score: 1,
|
|
7759
8005
|
weight,
|
|
7760
8006
|
hit: true,
|
|
7761
|
-
message:
|
|
8007
|
+
message: path18
|
|
7762
8008
|
};
|
|
7763
8009
|
}
|
|
7764
8010
|
return {
|
|
7765
|
-
path:
|
|
8011
|
+
path: path18,
|
|
7766
8012
|
score: 0,
|
|
7767
8013
|
weight,
|
|
7768
8014
|
hit: false,
|
|
7769
|
-
message: `${
|
|
8015
|
+
message: `${path18} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7770
8016
|
};
|
|
7771
8017
|
}
|
|
7772
8018
|
/**
|
|
@@ -7806,11 +8052,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
7806
8052
|
};
|
|
7807
8053
|
}
|
|
7808
8054
|
};
|
|
7809
|
-
function resolvePath(obj,
|
|
7810
|
-
if (!
|
|
8055
|
+
function resolvePath(obj, path18) {
|
|
8056
|
+
if (!path18 || !obj) {
|
|
7811
8057
|
return void 0;
|
|
7812
8058
|
}
|
|
7813
|
-
const parts =
|
|
8059
|
+
const parts = path18.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7814
8060
|
let current = obj;
|
|
7815
8061
|
for (const part of parts) {
|
|
7816
8062
|
if (current === null || current === void 0) {
|
|
@@ -8246,7 +8492,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8246
8492
|
|
|
8247
8493
|
// src/evaluation/orchestrator.ts
|
|
8248
8494
|
var import_node_crypto5 = require("crypto");
|
|
8249
|
-
var
|
|
8495
|
+
var import_node_path17 = __toESM(require("path"), 1);
|
|
8250
8496
|
|
|
8251
8497
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
8252
8498
|
var Node = class {
|
|
@@ -9045,7 +9291,7 @@ async function runEvaluatorList(options) {
|
|
|
9045
9291
|
});
|
|
9046
9292
|
}
|
|
9047
9293
|
if (evaluator.type === "composite") {
|
|
9048
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
9294
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path17.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
9049
9295
|
const createEvaluator = (memberConfig) => {
|
|
9050
9296
|
switch (memberConfig.type) {
|
|
9051
9297
|
case "llm_judge":
|
|
@@ -9620,6 +9866,7 @@ function createAgentKernel() {
|
|
|
9620
9866
|
createAgentKernel,
|
|
9621
9867
|
createProvider,
|
|
9622
9868
|
deepEqual,
|
|
9869
|
+
detectFormat,
|
|
9623
9870
|
ensureVSCodeSubagents,
|
|
9624
9871
|
executeScript,
|
|
9625
9872
|
explorationRatio,
|