@agentv/core 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +456 -202
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -2
- package/dist/index.d.ts +8 -2
- package/dist/index.js +403 -150
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -53,6 +53,7 @@ __export(index_exports, {
|
|
|
53
53
|
createAgentKernel: () => createAgentKernel,
|
|
54
54
|
createProvider: () => createProvider,
|
|
55
55
|
deepEqual: () => deepEqual,
|
|
56
|
+
detectFormat: () => detectFormat,
|
|
56
57
|
ensureVSCodeSubagents: () => ensureVSCodeSubagents,
|
|
57
58
|
executeScript: () => executeScript,
|
|
58
59
|
explorationRatio: () => explorationRatio,
|
|
@@ -226,9 +227,9 @@ function mergeExecutionMetrics(summary, metrics) {
|
|
|
226
227
|
}
|
|
227
228
|
|
|
228
229
|
// src/evaluation/yaml-parser.ts
|
|
229
|
-
var
|
|
230
|
-
var
|
|
231
|
-
var
|
|
230
|
+
var import_promises7 = require("fs/promises");
|
|
231
|
+
var import_node_path7 = __toESM(require("path"), 1);
|
|
232
|
+
var import_yaml3 = require("yaml");
|
|
232
233
|
|
|
233
234
|
// src/evaluation/loaders/config-loader.ts
|
|
234
235
|
var import_promises2 = require("fs/promises");
|
|
@@ -1006,6 +1007,11 @@ function isValidFieldAggregationType(value) {
|
|
|
1006
1007
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
1007
1008
|
}
|
|
1008
1009
|
|
|
1010
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1011
|
+
var import_promises5 = require("fs/promises");
|
|
1012
|
+
var import_node_path5 = __toESM(require("path"), 1);
|
|
1013
|
+
var import_yaml2 = require("yaml");
|
|
1014
|
+
|
|
1009
1015
|
// src/evaluation/loaders/message-processor.ts
|
|
1010
1016
|
var import_promises4 = require("fs/promises");
|
|
1011
1017
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
@@ -1266,28 +1272,271 @@ async function processExpectedMessages(options) {
|
|
|
1266
1272
|
return segments;
|
|
1267
1273
|
}
|
|
1268
1274
|
|
|
1269
|
-
// src/evaluation/
|
|
1270
|
-
var import_promises5 = require("fs/promises");
|
|
1271
|
-
var import_node_path5 = __toESM(require("path"), 1);
|
|
1275
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1272
1276
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
1277
|
+
var ANSI_RED = "\x1B[31m";
|
|
1273
1278
|
var ANSI_RESET5 = "\x1B[0m";
|
|
1279
|
+
function detectFormat(filePath) {
|
|
1280
|
+
const ext = import_node_path5.default.extname(filePath).toLowerCase();
|
|
1281
|
+
if (ext === ".jsonl") return "jsonl";
|
|
1282
|
+
if (ext === ".yaml" || ext === ".yml") return "yaml";
|
|
1283
|
+
throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`);
|
|
1284
|
+
}
|
|
1285
|
+
async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
1286
|
+
const dir = import_node_path5.default.dirname(jsonlPath);
|
|
1287
|
+
const base = import_node_path5.default.basename(jsonlPath, ".jsonl");
|
|
1288
|
+
const sidecarPath = import_node_path5.default.join(dir, `${base}.yaml`);
|
|
1289
|
+
if (!await fileExists(sidecarPath)) {
|
|
1290
|
+
if (verbose) {
|
|
1291
|
+
logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
|
|
1292
|
+
}
|
|
1293
|
+
return {};
|
|
1294
|
+
}
|
|
1295
|
+
try {
|
|
1296
|
+
const content = await (0, import_promises5.readFile)(sidecarPath, "utf8");
|
|
1297
|
+
const parsed = (0, import_yaml2.parse)(content);
|
|
1298
|
+
if (!isJsonObject(parsed)) {
|
|
1299
|
+
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
1300
|
+
return {};
|
|
1301
|
+
}
|
|
1302
|
+
return {
|
|
1303
|
+
description: asString4(parsed.description),
|
|
1304
|
+
dataset: asString4(parsed.dataset),
|
|
1305
|
+
execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
|
|
1306
|
+
evaluator: parsed.evaluator
|
|
1307
|
+
};
|
|
1308
|
+
} catch (error) {
|
|
1309
|
+
logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${error.message}`);
|
|
1310
|
+
return {};
|
|
1311
|
+
}
|
|
1312
|
+
}
|
|
1313
|
+
function parseJsonlContent(content, filePath) {
|
|
1314
|
+
const lines = content.split("\n");
|
|
1315
|
+
const cases = [];
|
|
1316
|
+
for (let i = 0; i < lines.length; i++) {
|
|
1317
|
+
const line = lines[i].trim();
|
|
1318
|
+
if (line === "") continue;
|
|
1319
|
+
try {
|
|
1320
|
+
const parsed = JSON.parse(line);
|
|
1321
|
+
if (!isJsonObject(parsed)) {
|
|
1322
|
+
throw new Error("Expected JSON object");
|
|
1323
|
+
}
|
|
1324
|
+
cases.push(parsed);
|
|
1325
|
+
} catch (error) {
|
|
1326
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1327
|
+
throw new Error(`Line ${i + 1}: Invalid JSON - ${message}
|
|
1328
|
+
File: ${filePath}`);
|
|
1329
|
+
}
|
|
1330
|
+
}
|
|
1331
|
+
return cases;
|
|
1332
|
+
}
|
|
1333
|
+
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
1334
|
+
const verbose = options?.verbose ?? false;
|
|
1335
|
+
const evalIdFilter = options?.evalId;
|
|
1336
|
+
const absoluteTestPath = import_node_path5.default.resolve(evalFilePath);
|
|
1337
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1338
|
+
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
1339
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1340
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
1341
|
+
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
1342
|
+
const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
|
|
1343
|
+
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
1344
|
+
const fallbackDataset = import_node_path5.default.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
1345
|
+
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
1346
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
|
|
1347
|
+
const globalExecution = sidecar.execution;
|
|
1348
|
+
if (verbose) {
|
|
1349
|
+
console.log(`
|
|
1350
|
+
[JSONL Dataset: ${evalFilePath}]`);
|
|
1351
|
+
console.log(` Cases: ${rawCases.length}`);
|
|
1352
|
+
console.log(` Dataset name: ${datasetName}`);
|
|
1353
|
+
if (sidecar.description) {
|
|
1354
|
+
console.log(` Description: ${sidecar.description}`);
|
|
1355
|
+
}
|
|
1356
|
+
}
|
|
1357
|
+
const results = [];
|
|
1358
|
+
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
1359
|
+
const evalcase = rawCases[lineIndex];
|
|
1360
|
+
const lineNumber = lineIndex + 1;
|
|
1361
|
+
const id = asString4(evalcase.id);
|
|
1362
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
1363
|
+
continue;
|
|
1364
|
+
}
|
|
1365
|
+
const conversationId = asString4(evalcase.conversation_id);
|
|
1366
|
+
const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
|
|
1367
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
1368
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
1369
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
1370
|
+
logError(
|
|
1371
|
+
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
|
|
1372
|
+
);
|
|
1373
|
+
continue;
|
|
1374
|
+
}
|
|
1375
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
1376
|
+
const inputMessages = inputMessagesValue.filter(
|
|
1377
|
+
(msg) => isTestMessage(msg)
|
|
1378
|
+
);
|
|
1379
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1380
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1381
|
+
logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
|
|
1382
|
+
continue;
|
|
1383
|
+
}
|
|
1384
|
+
const guidelinePaths = [];
|
|
1385
|
+
const inputTextParts = [];
|
|
1386
|
+
const inputSegments = await processMessages({
|
|
1387
|
+
messages: inputMessages,
|
|
1388
|
+
searchRoots,
|
|
1389
|
+
repoRootPath,
|
|
1390
|
+
guidelinePatterns,
|
|
1391
|
+
guidelinePaths,
|
|
1392
|
+
textParts: inputTextParts,
|
|
1393
|
+
messageType: "input",
|
|
1394
|
+
verbose
|
|
1395
|
+
});
|
|
1396
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1397
|
+
messages: expectedMessages,
|
|
1398
|
+
searchRoots,
|
|
1399
|
+
repoRootPath,
|
|
1400
|
+
verbose
|
|
1401
|
+
}) : [];
|
|
1402
|
+
let referenceAnswer = "";
|
|
1403
|
+
if (outputSegments.length > 0) {
|
|
1404
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1405
|
+
const content = lastMessage.content;
|
|
1406
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1407
|
+
if (typeof content === "string") {
|
|
1408
|
+
referenceAnswer = content;
|
|
1409
|
+
} else if (content !== void 0 && content !== null) {
|
|
1410
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1411
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1412
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
1415
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1416
|
+
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
1417
|
+
const mergedExecution = caseExecution ?? globalExecution;
|
|
1418
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1419
|
+
let evaluators;
|
|
1420
|
+
try {
|
|
1421
|
+
evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
|
|
1422
|
+
} catch (error) {
|
|
1423
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1424
|
+
logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
|
|
1425
|
+
continue;
|
|
1426
|
+
}
|
|
1427
|
+
const inlineRubrics = evalcase.rubrics;
|
|
1428
|
+
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1429
|
+
const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
|
|
1430
|
+
if (typeof rubric === "string") {
|
|
1431
|
+
return {
|
|
1432
|
+
id: `rubric-${index + 1}`,
|
|
1433
|
+
description: rubric,
|
|
1434
|
+
weight: 1,
|
|
1435
|
+
required: true
|
|
1436
|
+
};
|
|
1437
|
+
}
|
|
1438
|
+
return {
|
|
1439
|
+
id: asString4(rubric.id) ?? `rubric-${index + 1}`,
|
|
1440
|
+
description: asString4(rubric.description) ?? "",
|
|
1441
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1442
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1443
|
+
};
|
|
1444
|
+
}).filter((r) => r.description.length > 0);
|
|
1445
|
+
if (rubricItems.length > 0) {
|
|
1446
|
+
const rubricEvaluator = {
|
|
1447
|
+
name: "rubric",
|
|
1448
|
+
type: "llm_judge",
|
|
1449
|
+
rubrics: rubricItems
|
|
1450
|
+
};
|
|
1451
|
+
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1452
|
+
}
|
|
1453
|
+
}
|
|
1454
|
+
const userFilePaths = [];
|
|
1455
|
+
for (const segment of inputSegments) {
|
|
1456
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
1457
|
+
userFilePaths.push(segment.resolvedPath);
|
|
1458
|
+
}
|
|
1459
|
+
}
|
|
1460
|
+
const allFilePaths = [
|
|
1461
|
+
...guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
|
|
1462
|
+
...userFilePaths
|
|
1463
|
+
];
|
|
1464
|
+
const testCase = {
|
|
1465
|
+
id,
|
|
1466
|
+
dataset: datasetName,
|
|
1467
|
+
conversation_id: conversationId,
|
|
1468
|
+
question,
|
|
1469
|
+
input_messages: inputMessages,
|
|
1470
|
+
input_segments: inputSegments,
|
|
1471
|
+
expected_messages: outputSegments,
|
|
1472
|
+
reference_answer: referenceAnswer,
|
|
1473
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path5.default.resolve(guidelinePath)),
|
|
1474
|
+
guideline_patterns: guidelinePatterns,
|
|
1475
|
+
file_paths: allFilePaths,
|
|
1476
|
+
expected_outcome: outcome,
|
|
1477
|
+
evaluator: evalCaseEvaluatorKind,
|
|
1478
|
+
evaluators
|
|
1479
|
+
};
|
|
1480
|
+
if (verbose) {
|
|
1481
|
+
console.log(`
|
|
1482
|
+
[Eval Case: ${id}]`);
|
|
1483
|
+
if (testCase.guideline_paths.length > 0) {
|
|
1484
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
1485
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
1486
|
+
console.log(` - ${guidelinePath}`);
|
|
1487
|
+
}
|
|
1488
|
+
} else {
|
|
1489
|
+
console.log(" No guidelines found");
|
|
1490
|
+
}
|
|
1491
|
+
}
|
|
1492
|
+
results.push(testCase);
|
|
1493
|
+
}
|
|
1494
|
+
return results;
|
|
1495
|
+
}
|
|
1496
|
+
function asString4(value) {
|
|
1497
|
+
return typeof value === "string" ? value : void 0;
|
|
1498
|
+
}
|
|
1499
|
+
function logWarning4(message, details) {
|
|
1500
|
+
if (details && details.length > 0) {
|
|
1501
|
+
const detailBlock = details.join("\n");
|
|
1502
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
1503
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1504
|
+
} else {
|
|
1505
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
1506
|
+
}
|
|
1507
|
+
}
|
|
1508
|
+
function logError(message, details) {
|
|
1509
|
+
if (details && details.length > 0) {
|
|
1510
|
+
const detailBlock = details.join("\n");
|
|
1511
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1512
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1513
|
+
} else {
|
|
1514
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET5}`);
|
|
1515
|
+
}
|
|
1516
|
+
}
|
|
1517
|
+
|
|
1518
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
1519
|
+
var import_promises6 = require("fs/promises");
|
|
1520
|
+
var import_node_path6 = __toESM(require("path"), 1);
|
|
1521
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
1522
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
1274
1523
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
1275
1524
|
const guidelineParts = [];
|
|
1276
1525
|
for (const rawPath of testCase.guideline_paths) {
|
|
1277
|
-
const absolutePath =
|
|
1526
|
+
const absolutePath = import_node_path6.default.resolve(rawPath);
|
|
1278
1527
|
if (!await fileExists(absolutePath)) {
|
|
1279
|
-
|
|
1528
|
+
logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
1280
1529
|
continue;
|
|
1281
1530
|
}
|
|
1282
1531
|
try {
|
|
1283
|
-
const content = (await (0,
|
|
1532
|
+
const content = (await (0, import_promises6.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
1284
1533
|
guidelineParts.push({
|
|
1285
1534
|
content,
|
|
1286
1535
|
isFile: true,
|
|
1287
|
-
displayPath:
|
|
1536
|
+
displayPath: import_node_path6.default.basename(absolutePath)
|
|
1288
1537
|
});
|
|
1289
1538
|
} catch (error) {
|
|
1290
|
-
|
|
1539
|
+
logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
1291
1540
|
}
|
|
1292
1541
|
}
|
|
1293
1542
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -1311,9 +1560,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1311
1560
|
messageSegments.push({ type: "text", value: segment });
|
|
1312
1561
|
}
|
|
1313
1562
|
} else if (isJsonObject(segment)) {
|
|
1314
|
-
const type =
|
|
1563
|
+
const type = asString5(segment.type);
|
|
1315
1564
|
if (type === "file") {
|
|
1316
|
-
const value =
|
|
1565
|
+
const value = asString5(segment.value);
|
|
1317
1566
|
if (!value) continue;
|
|
1318
1567
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
1319
1568
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -1324,7 +1573,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1324
1573
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
1325
1574
|
}
|
|
1326
1575
|
} else if (type === "text") {
|
|
1327
|
-
const textValue =
|
|
1576
|
+
const textValue = asString5(segment.value);
|
|
1328
1577
|
if (textValue && textValue.trim().length > 0) {
|
|
1329
1578
|
messageSegments.push({ type: "text", value: textValue });
|
|
1330
1579
|
}
|
|
@@ -1478,22 +1727,22 @@ ${guidelineContent.trim()}`);
|
|
|
1478
1727
|
}
|
|
1479
1728
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
1480
1729
|
}
|
|
1481
|
-
function
|
|
1730
|
+
function asString5(value) {
|
|
1482
1731
|
return typeof value === "string" ? value : void 0;
|
|
1483
1732
|
}
|
|
1484
|
-
function
|
|
1485
|
-
console.warn(`${
|
|
1733
|
+
function logWarning5(message) {
|
|
1734
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1486
1735
|
}
|
|
1487
1736
|
|
|
1488
1737
|
// src/evaluation/yaml-parser.ts
|
|
1489
|
-
var
|
|
1490
|
-
var
|
|
1491
|
-
var
|
|
1738
|
+
var ANSI_YELLOW7 = "\x1B[33m";
|
|
1739
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
1740
|
+
var ANSI_RESET7 = "\x1B[0m";
|
|
1492
1741
|
async function readTestSuiteMetadata(testFilePath) {
|
|
1493
1742
|
try {
|
|
1494
|
-
const absolutePath =
|
|
1495
|
-
const content = await (0,
|
|
1496
|
-
const parsed = (0,
|
|
1743
|
+
const absolutePath = import_node_path7.default.resolve(testFilePath);
|
|
1744
|
+
const content = await (0, import_promises7.readFile)(absolutePath, "utf8");
|
|
1745
|
+
const parsed = (0, import_yaml3.parse)(content);
|
|
1497
1746
|
if (!isJsonObject(parsed)) {
|
|
1498
1747
|
return {};
|
|
1499
1748
|
}
|
|
@@ -1503,21 +1752,25 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
1503
1752
|
}
|
|
1504
1753
|
}
|
|
1505
1754
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
1755
|
+
const format = detectFormat(evalFilePath);
|
|
1756
|
+
if (format === "jsonl") {
|
|
1757
|
+
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
1758
|
+
}
|
|
1506
1759
|
const verbose = options?.verbose ?? false;
|
|
1507
1760
|
const evalIdFilter = options?.evalId;
|
|
1508
|
-
const absoluteTestPath =
|
|
1761
|
+
const absoluteTestPath = import_node_path7.default.resolve(evalFilePath);
|
|
1509
1762
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1510
1763
|
const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
|
|
1511
1764
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1512
1765
|
const guidelinePatterns = config?.guideline_patterns;
|
|
1513
|
-
const rawFile = await (0,
|
|
1514
|
-
const parsed = (0,
|
|
1766
|
+
const rawFile = await (0, import_promises7.readFile)(absoluteTestPath, "utf8");
|
|
1767
|
+
const parsed = (0, import_yaml3.parse)(rawFile);
|
|
1515
1768
|
if (!isJsonObject(parsed)) {
|
|
1516
1769
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
1517
1770
|
}
|
|
1518
1771
|
const suite = parsed;
|
|
1519
|
-
const datasetNameFromSuite =
|
|
1520
|
-
const fallbackDataset =
|
|
1772
|
+
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
1773
|
+
const fallbackDataset = import_node_path7.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
1521
1774
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
1522
1775
|
const rawTestcases = suite.evalcases;
|
|
1523
1776
|
if (!Array.isArray(rawTestcases)) {
|
|
@@ -1525,24 +1778,24 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1525
1778
|
}
|
|
1526
1779
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
1527
1780
|
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
1528
|
-
const _globalTarget =
|
|
1781
|
+
const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
|
|
1529
1782
|
const results = [];
|
|
1530
1783
|
for (const rawEvalcase of rawTestcases) {
|
|
1531
1784
|
if (!isJsonObject(rawEvalcase)) {
|
|
1532
|
-
|
|
1785
|
+
logWarning6("Skipping invalid eval case entry (expected object)");
|
|
1533
1786
|
continue;
|
|
1534
1787
|
}
|
|
1535
1788
|
const evalcase = rawEvalcase;
|
|
1536
|
-
const id =
|
|
1789
|
+
const id = asString6(evalcase.id);
|
|
1537
1790
|
if (evalIdFilter && id !== evalIdFilter) {
|
|
1538
1791
|
continue;
|
|
1539
1792
|
}
|
|
1540
|
-
const conversationId =
|
|
1541
|
-
const outcome =
|
|
1793
|
+
const conversationId = asString6(evalcase.conversation_id);
|
|
1794
|
+
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
1542
1795
|
const inputMessagesValue = evalcase.input_messages;
|
|
1543
1796
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
1544
1797
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
1545
|
-
|
|
1798
|
+
logError2(
|
|
1546
1799
|
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
1547
1800
|
);
|
|
1548
1801
|
continue;
|
|
@@ -1553,7 +1806,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1553
1806
|
);
|
|
1554
1807
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1555
1808
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1556
|
-
|
|
1809
|
+
logError2(`No valid expected message found for eval case: ${id}`);
|
|
1557
1810
|
continue;
|
|
1558
1811
|
}
|
|
1559
1812
|
const guidelinePaths = [];
|
|
@@ -1594,7 +1847,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1594
1847
|
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
1595
1848
|
} catch (error) {
|
|
1596
1849
|
const message = error instanceof Error ? error.message : String(error);
|
|
1597
|
-
|
|
1850
|
+
logError2(`Skipping eval case '${id}': ${message}`);
|
|
1598
1851
|
continue;
|
|
1599
1852
|
}
|
|
1600
1853
|
const inlineRubrics = evalcase.rubrics;
|
|
@@ -1609,8 +1862,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1609
1862
|
};
|
|
1610
1863
|
}
|
|
1611
1864
|
return {
|
|
1612
|
-
id:
|
|
1613
|
-
description:
|
|
1865
|
+
id: asString6(rubric.id) ?? `rubric-${index + 1}`,
|
|
1866
|
+
description: asString6(rubric.description) ?? "",
|
|
1614
1867
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1615
1868
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1616
1869
|
};
|
|
@@ -1631,7 +1884,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1631
1884
|
}
|
|
1632
1885
|
}
|
|
1633
1886
|
const allFilePaths = [
|
|
1634
|
-
...guidelinePaths.map((guidelinePath) =>
|
|
1887
|
+
...guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
|
|
1635
1888
|
...userFilePaths
|
|
1636
1889
|
];
|
|
1637
1890
|
const testCase = {
|
|
@@ -1643,7 +1896,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1643
1896
|
input_segments: inputSegments,
|
|
1644
1897
|
expected_messages: outputSegments,
|
|
1645
1898
|
reference_answer: referenceAnswer,
|
|
1646
|
-
guideline_paths: guidelinePaths.map((guidelinePath) =>
|
|
1899
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path7.default.resolve(guidelinePath)),
|
|
1647
1900
|
guideline_patterns: guidelinePatterns,
|
|
1648
1901
|
file_paths: allFilePaths,
|
|
1649
1902
|
expected_outcome: outcome,
|
|
@@ -1666,35 +1919,35 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1666
1919
|
}
|
|
1667
1920
|
return results;
|
|
1668
1921
|
}
|
|
1669
|
-
function
|
|
1922
|
+
function asString6(value) {
|
|
1670
1923
|
return typeof value === "string" ? value : void 0;
|
|
1671
1924
|
}
|
|
1672
|
-
function
|
|
1925
|
+
function logWarning6(message, details) {
|
|
1673
1926
|
if (details && details.length > 0) {
|
|
1674
1927
|
const detailBlock = details.join("\n");
|
|
1675
|
-
console.warn(`${
|
|
1676
|
-
${detailBlock}${
|
|
1928
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}
|
|
1929
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1677
1930
|
} else {
|
|
1678
|
-
console.warn(`${
|
|
1931
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
|
|
1679
1932
|
}
|
|
1680
1933
|
}
|
|
1681
|
-
function
|
|
1934
|
+
function logError2(message, details) {
|
|
1682
1935
|
if (details && details.length > 0) {
|
|
1683
1936
|
const detailBlock = details.join("\n");
|
|
1684
|
-
console.error(`${
|
|
1685
|
-
${detailBlock}${
|
|
1937
|
+
console.error(`${ANSI_RED2}Error: ${message}
|
|
1938
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1686
1939
|
} else {
|
|
1687
|
-
console.error(`${
|
|
1940
|
+
console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
|
|
1688
1941
|
}
|
|
1689
1942
|
}
|
|
1690
1943
|
|
|
1691
1944
|
// src/evaluation/file-utils.ts
|
|
1692
1945
|
var import_node_fs2 = require("fs");
|
|
1693
|
-
var
|
|
1694
|
-
var
|
|
1946
|
+
var import_promises8 = require("fs/promises");
|
|
1947
|
+
var import_node_path8 = __toESM(require("path"), 1);
|
|
1695
1948
|
async function fileExists2(filePath) {
|
|
1696
1949
|
try {
|
|
1697
|
-
await (0,
|
|
1950
|
+
await (0, import_promises8.access)(filePath, import_node_fs2.constants.F_OK);
|
|
1698
1951
|
return true;
|
|
1699
1952
|
} catch {
|
|
1700
1953
|
return false;
|
|
@@ -1704,22 +1957,22 @@ function normalizeLineEndings(content) {
|
|
|
1704
1957
|
return content.replace(/\r\n/g, "\n");
|
|
1705
1958
|
}
|
|
1706
1959
|
async function readTextFile(filePath) {
|
|
1707
|
-
const content = await (0,
|
|
1960
|
+
const content = await (0, import_promises8.readFile)(filePath, "utf8");
|
|
1708
1961
|
return normalizeLineEndings(content);
|
|
1709
1962
|
}
|
|
1710
1963
|
async function readJsonFile(filePath) {
|
|
1711
|
-
const content = await (0,
|
|
1964
|
+
const content = await (0, import_promises8.readFile)(filePath, "utf8");
|
|
1712
1965
|
return JSON.parse(content);
|
|
1713
1966
|
}
|
|
1714
1967
|
async function findGitRoot(startPath) {
|
|
1715
|
-
let currentDir =
|
|
1716
|
-
const root =
|
|
1968
|
+
let currentDir = import_node_path8.default.dirname(import_node_path8.default.resolve(startPath));
|
|
1969
|
+
const root = import_node_path8.default.parse(currentDir).root;
|
|
1717
1970
|
while (currentDir !== root) {
|
|
1718
|
-
const gitPath =
|
|
1971
|
+
const gitPath = import_node_path8.default.join(currentDir, ".git");
|
|
1719
1972
|
if (await fileExists2(gitPath)) {
|
|
1720
1973
|
return currentDir;
|
|
1721
1974
|
}
|
|
1722
|
-
const parentDir =
|
|
1975
|
+
const parentDir = import_node_path8.default.dirname(currentDir);
|
|
1723
1976
|
if (parentDir === currentDir) {
|
|
1724
1977
|
break;
|
|
1725
1978
|
}
|
|
@@ -1730,8 +1983,8 @@ async function findGitRoot(startPath) {
|
|
|
1730
1983
|
function buildDirectoryChain2(filePath, repoRoot) {
|
|
1731
1984
|
const directories = [];
|
|
1732
1985
|
const seen = /* @__PURE__ */ new Set();
|
|
1733
|
-
const boundary =
|
|
1734
|
-
let current =
|
|
1986
|
+
const boundary = import_node_path8.default.resolve(repoRoot);
|
|
1987
|
+
let current = import_node_path8.default.resolve(import_node_path8.default.dirname(filePath));
|
|
1735
1988
|
while (current !== void 0) {
|
|
1736
1989
|
if (!seen.has(current)) {
|
|
1737
1990
|
directories.push(current);
|
|
@@ -1740,7 +1993,7 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
1740
1993
|
if (current === boundary) {
|
|
1741
1994
|
break;
|
|
1742
1995
|
}
|
|
1743
|
-
const parent =
|
|
1996
|
+
const parent = import_node_path8.default.dirname(current);
|
|
1744
1997
|
if (parent === current) {
|
|
1745
1998
|
break;
|
|
1746
1999
|
}
|
|
@@ -1754,16 +2007,16 @@ function buildDirectoryChain2(filePath, repoRoot) {
|
|
|
1754
2007
|
function buildSearchRoots2(evalPath, repoRoot) {
|
|
1755
2008
|
const uniqueRoots = [];
|
|
1756
2009
|
const addRoot = (root) => {
|
|
1757
|
-
const normalized =
|
|
2010
|
+
const normalized = import_node_path8.default.resolve(root);
|
|
1758
2011
|
if (!uniqueRoots.includes(normalized)) {
|
|
1759
2012
|
uniqueRoots.push(normalized);
|
|
1760
2013
|
}
|
|
1761
2014
|
};
|
|
1762
|
-
let currentDir =
|
|
2015
|
+
let currentDir = import_node_path8.default.dirname(evalPath);
|
|
1763
2016
|
let reachedBoundary = false;
|
|
1764
2017
|
while (!reachedBoundary) {
|
|
1765
2018
|
addRoot(currentDir);
|
|
1766
|
-
const parentDir =
|
|
2019
|
+
const parentDir = import_node_path8.default.dirname(currentDir);
|
|
1767
2020
|
if (currentDir === repoRoot || parentDir === currentDir) {
|
|
1768
2021
|
reachedBoundary = true;
|
|
1769
2022
|
} else {
|
|
@@ -1781,16 +2034,16 @@ function trimLeadingSeparators2(value) {
|
|
|
1781
2034
|
async function resolveFileReference2(rawValue, searchRoots) {
|
|
1782
2035
|
const displayPath = trimLeadingSeparators2(rawValue);
|
|
1783
2036
|
const potentialPaths = [];
|
|
1784
|
-
if (
|
|
1785
|
-
potentialPaths.push(
|
|
2037
|
+
if (import_node_path8.default.isAbsolute(rawValue)) {
|
|
2038
|
+
potentialPaths.push(import_node_path8.default.normalize(rawValue));
|
|
1786
2039
|
}
|
|
1787
2040
|
for (const base of searchRoots) {
|
|
1788
|
-
potentialPaths.push(
|
|
2041
|
+
potentialPaths.push(import_node_path8.default.resolve(base, displayPath));
|
|
1789
2042
|
}
|
|
1790
2043
|
const attempted = [];
|
|
1791
2044
|
const seen = /* @__PURE__ */ new Set();
|
|
1792
2045
|
for (const candidate of potentialPaths) {
|
|
1793
|
-
const absoluteCandidate =
|
|
2046
|
+
const absoluteCandidate = import_node_path8.default.resolve(candidate);
|
|
1794
2047
|
if (seen.has(absoluteCandidate)) {
|
|
1795
2048
|
continue;
|
|
1796
2049
|
}
|
|
@@ -2140,9 +2393,9 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
2140
2393
|
var import_node_child_process = require("child_process");
|
|
2141
2394
|
var import_node_crypto = require("crypto");
|
|
2142
2395
|
var import_node_fs3 = require("fs");
|
|
2143
|
-
var
|
|
2396
|
+
var import_promises9 = require("fs/promises");
|
|
2144
2397
|
var import_node_os = require("os");
|
|
2145
|
-
var
|
|
2398
|
+
var import_node_path10 = __toESM(require("path"), 1);
|
|
2146
2399
|
|
|
2147
2400
|
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
2148
2401
|
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
|
|
@@ -2198,7 +2451,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
|
|
|
2198
2451
|
}
|
|
2199
2452
|
|
|
2200
2453
|
// src/evaluation/providers/preread.ts
|
|
2201
|
-
var
|
|
2454
|
+
var import_node_path9 = __toESM(require("path"), 1);
|
|
2202
2455
|
function buildPromptDocument(request, inputFiles, options) {
|
|
2203
2456
|
const parts = [];
|
|
2204
2457
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -2221,7 +2474,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
2221
2474
|
}
|
|
2222
2475
|
const deduped = /* @__PURE__ */ new Map();
|
|
2223
2476
|
for (const inputFile of inputFiles) {
|
|
2224
|
-
const absolutePath =
|
|
2477
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2225
2478
|
if (!deduped.has(absolutePath)) {
|
|
2226
2479
|
deduped.set(absolutePath, absolutePath);
|
|
2227
2480
|
}
|
|
@@ -2234,14 +2487,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
2234
2487
|
}
|
|
2235
2488
|
const unique = /* @__PURE__ */ new Map();
|
|
2236
2489
|
for (const inputFile of inputFiles) {
|
|
2237
|
-
const absolutePath =
|
|
2490
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2238
2491
|
if (overrides?.has(absolutePath)) {
|
|
2239
2492
|
if (!unique.has(absolutePath)) {
|
|
2240
2493
|
unique.set(absolutePath, absolutePath);
|
|
2241
2494
|
}
|
|
2242
2495
|
continue;
|
|
2243
2496
|
}
|
|
2244
|
-
const normalized = absolutePath.split(
|
|
2497
|
+
const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
|
|
2245
2498
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2246
2499
|
if (!unique.has(absolutePath)) {
|
|
2247
2500
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2256,7 +2509,7 @@ function collectInputFiles(inputFiles) {
|
|
|
2256
2509
|
}
|
|
2257
2510
|
const unique = /* @__PURE__ */ new Map();
|
|
2258
2511
|
for (const inputFile of inputFiles) {
|
|
2259
|
-
const absolutePath =
|
|
2512
|
+
const absolutePath = import_node_path9.default.resolve(inputFile);
|
|
2260
2513
|
if (!unique.has(absolutePath)) {
|
|
2261
2514
|
unique.set(absolutePath, absolutePath);
|
|
2262
2515
|
}
|
|
@@ -2268,7 +2521,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
2268
2521
|
return "";
|
|
2269
2522
|
}
|
|
2270
2523
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2271
|
-
const fileName =
|
|
2524
|
+
const fileName = import_node_path9.default.basename(absolutePath);
|
|
2272
2525
|
const fileUri = pathToFileUri(absolutePath);
|
|
2273
2526
|
return `* [${fileName}](${fileUri})`;
|
|
2274
2527
|
});
|
|
@@ -2288,7 +2541,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
2288
2541
|
return sections.join("\n");
|
|
2289
2542
|
}
|
|
2290
2543
|
function pathToFileUri(filePath) {
|
|
2291
|
-
const absolutePath =
|
|
2544
|
+
const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
|
|
2292
2545
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2293
2546
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2294
2547
|
return `file:///${normalizedPath}`;
|
|
@@ -2325,8 +2578,8 @@ var ClaudeCodeProvider = class {
|
|
|
2325
2578
|
const workspaceRoot = await this.createWorkspace();
|
|
2326
2579
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2327
2580
|
try {
|
|
2328
|
-
const promptFile =
|
|
2329
|
-
await (0,
|
|
2581
|
+
const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
|
|
2582
|
+
await (0, import_promises9.writeFile)(promptFile, request.question, "utf8");
|
|
2330
2583
|
const args = this.buildClaudeCodeArgs(request.question, inputFiles);
|
|
2331
2584
|
const cwd = this.resolveCwd();
|
|
2332
2585
|
const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
|
|
@@ -2373,7 +2626,7 @@ var ClaudeCodeProvider = class {
|
|
|
2373
2626
|
if (!this.config.cwd) {
|
|
2374
2627
|
return process.cwd();
|
|
2375
2628
|
}
|
|
2376
|
-
return
|
|
2629
|
+
return import_node_path10.default.resolve(this.config.cwd);
|
|
2377
2630
|
}
|
|
2378
2631
|
buildClaudeCodeArgs(prompt, inputFiles) {
|
|
2379
2632
|
const args = [];
|
|
@@ -2430,11 +2683,11 @@ ${filesContext}`;
|
|
|
2430
2683
|
}
|
|
2431
2684
|
}
|
|
2432
2685
|
async createWorkspace() {
|
|
2433
|
-
return await (0,
|
|
2686
|
+
return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
|
|
2434
2687
|
}
|
|
2435
2688
|
async cleanupWorkspace(workspaceRoot) {
|
|
2436
2689
|
try {
|
|
2437
|
-
await (0,
|
|
2690
|
+
await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
|
|
2438
2691
|
} catch {
|
|
2439
2692
|
}
|
|
2440
2693
|
}
|
|
@@ -2444,9 +2697,9 @@ ${filesContext}`;
|
|
|
2444
2697
|
return void 0;
|
|
2445
2698
|
}
|
|
2446
2699
|
if (this.config.logDir) {
|
|
2447
|
-
return
|
|
2700
|
+
return import_node_path10.default.resolve(this.config.logDir);
|
|
2448
2701
|
}
|
|
2449
|
-
return
|
|
2702
|
+
return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "claude-code");
|
|
2450
2703
|
}
|
|
2451
2704
|
async createStreamLogger(request) {
|
|
2452
2705
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2454,13 +2707,13 @@ ${filesContext}`;
|
|
|
2454
2707
|
return void 0;
|
|
2455
2708
|
}
|
|
2456
2709
|
try {
|
|
2457
|
-
await (0,
|
|
2710
|
+
await (0, import_promises9.mkdir)(logDir, { recursive: true });
|
|
2458
2711
|
} catch (error) {
|
|
2459
2712
|
const message = error instanceof Error ? error.message : String(error);
|
|
2460
2713
|
console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
|
|
2461
2714
|
return void 0;
|
|
2462
2715
|
}
|
|
2463
|
-
const filePath =
|
|
2716
|
+
const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
|
|
2464
2717
|
try {
|
|
2465
2718
|
const logger = await ClaudeCodeStreamLogger.create({
|
|
2466
2719
|
filePath,
|
|
@@ -2865,16 +3118,16 @@ function escapeShellArg(arg) {
|
|
|
2865
3118
|
}
|
|
2866
3119
|
async function defaultClaudeCodeRunner(options) {
|
|
2867
3120
|
const tempId = (0, import_node_crypto.randomUUID)();
|
|
2868
|
-
const stdoutFile =
|
|
2869
|
-
const stderrFile =
|
|
2870
|
-
const exitFile =
|
|
2871
|
-
const pidFile =
|
|
3121
|
+
const stdoutFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stdout`);
|
|
3122
|
+
const stderrFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-stderr`);
|
|
3123
|
+
const exitFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-exit`);
|
|
3124
|
+
const pidFile = import_node_path10.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${tempId}-pid`);
|
|
2872
3125
|
try {
|
|
2873
3126
|
return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
|
|
2874
3127
|
} finally {
|
|
2875
3128
|
for (const file of [stdoutFile, stderrFile, exitFile, pidFile]) {
|
|
2876
3129
|
try {
|
|
2877
|
-
await (0,
|
|
3130
|
+
await (0, import_promises9.rm)(file, { force: true });
|
|
2878
3131
|
} catch {
|
|
2879
3132
|
}
|
|
2880
3133
|
}
|
|
@@ -2908,8 +3161,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2908
3161
|
let lastStdoutSize = 0;
|
|
2909
3162
|
const readFileIfExists = async (filePath) => {
|
|
2910
3163
|
try {
|
|
2911
|
-
const { readFile:
|
|
2912
|
-
return await
|
|
3164
|
+
const { readFile: readFile9 } = await import("fs/promises");
|
|
3165
|
+
return await readFile9(filePath, "utf8");
|
|
2913
3166
|
} catch {
|
|
2914
3167
|
return "";
|
|
2915
3168
|
}
|
|
@@ -2982,9 +3235,9 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2982
3235
|
|
|
2983
3236
|
// src/evaluation/providers/cli.ts
|
|
2984
3237
|
var import_node_child_process2 = require("child_process");
|
|
2985
|
-
var
|
|
3238
|
+
var import_promises10 = __toESM(require("fs/promises"), 1);
|
|
2986
3239
|
var import_node_os2 = __toESM(require("os"), 1);
|
|
2987
|
-
var
|
|
3240
|
+
var import_node_path11 = __toESM(require("path"), 1);
|
|
2988
3241
|
var import_node_util = require("util");
|
|
2989
3242
|
var import_zod = require("zod");
|
|
2990
3243
|
var ToolCallSchema = import_zod.z.object({
|
|
@@ -3353,7 +3606,7 @@ var CliProvider = class {
|
|
|
3353
3606
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
3354
3607
|
} finally {
|
|
3355
3608
|
if (!this.keepTempFiles) {
|
|
3356
|
-
await
|
|
3609
|
+
await import_promises10.default.unlink(filePath).catch(() => {
|
|
3357
3610
|
});
|
|
3358
3611
|
}
|
|
3359
3612
|
}
|
|
@@ -3441,7 +3694,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
3441
3694
|
}
|
|
3442
3695
|
const unique = /* @__PURE__ */ new Map();
|
|
3443
3696
|
for (const inputFile of inputFiles) {
|
|
3444
|
-
const absolutePath =
|
|
3697
|
+
const absolutePath = import_node_path11.default.resolve(inputFile);
|
|
3445
3698
|
if (!unique.has(absolutePath)) {
|
|
3446
3699
|
unique.set(absolutePath, absolutePath);
|
|
3447
3700
|
}
|
|
@@ -3455,7 +3708,7 @@ function formatFileList(files, template) {
|
|
|
3455
3708
|
const formatter = template ?? "{path}";
|
|
3456
3709
|
return files.map((filePath) => {
|
|
3457
3710
|
const escapedPath = shellEscape(filePath);
|
|
3458
|
-
const escapedName = shellEscape(
|
|
3711
|
+
const escapedName = shellEscape(import_node_path11.default.basename(filePath));
|
|
3459
3712
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
3460
3713
|
}).join(" ");
|
|
3461
3714
|
}
|
|
@@ -3479,7 +3732,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
3479
3732
|
const safeEvalId = evalCaseId || "unknown";
|
|
3480
3733
|
const timestamp = Date.now();
|
|
3481
3734
|
const random = Math.random().toString(36).substring(2, 9);
|
|
3482
|
-
return
|
|
3735
|
+
return import_node_path11.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
3483
3736
|
}
|
|
3484
3737
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
3485
3738
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -3493,9 +3746,9 @@ function formatTimeoutSuffix2(timeoutMs) {
|
|
|
3493
3746
|
var import_node_child_process3 = require("child_process");
|
|
3494
3747
|
var import_node_crypto2 = require("crypto");
|
|
3495
3748
|
var import_node_fs4 = require("fs");
|
|
3496
|
-
var
|
|
3749
|
+
var import_promises11 = require("fs/promises");
|
|
3497
3750
|
var import_node_os3 = require("os");
|
|
3498
|
-
var
|
|
3751
|
+
var import_node_path12 = __toESM(require("path"), 1);
|
|
3499
3752
|
var import_node_util2 = require("util");
|
|
3500
3753
|
|
|
3501
3754
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -3590,8 +3843,8 @@ var CodexProvider = class {
|
|
|
3590
3843
|
const promptContent = `${systemPrompt}
|
|
3591
3844
|
|
|
3592
3845
|
${basePrompt}`;
|
|
3593
|
-
const promptFile =
|
|
3594
|
-
await (0,
|
|
3846
|
+
const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3847
|
+
await (0, import_promises11.writeFile)(promptFile, promptContent, "utf8");
|
|
3595
3848
|
const args = this.buildCodexArgs();
|
|
3596
3849
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
3597
3850
|
const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
|
|
@@ -3640,7 +3893,7 @@ ${basePrompt}`;
|
|
|
3640
3893
|
if (!this.config.cwd) {
|
|
3641
3894
|
return workspaceRoot;
|
|
3642
3895
|
}
|
|
3643
|
-
return
|
|
3896
|
+
return import_node_path12.default.resolve(this.config.cwd);
|
|
3644
3897
|
}
|
|
3645
3898
|
buildCodexArgs() {
|
|
3646
3899
|
const args = [
|
|
@@ -3682,11 +3935,11 @@ ${basePrompt}`;
|
|
|
3682
3935
|
}
|
|
3683
3936
|
}
|
|
3684
3937
|
async createWorkspace() {
|
|
3685
|
-
return await (0,
|
|
3938
|
+
return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
|
|
3686
3939
|
}
|
|
3687
3940
|
async cleanupWorkspace(workspaceRoot) {
|
|
3688
3941
|
try {
|
|
3689
|
-
await (0,
|
|
3942
|
+
await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
|
|
3690
3943
|
} catch {
|
|
3691
3944
|
}
|
|
3692
3945
|
}
|
|
@@ -3696,9 +3949,9 @@ ${basePrompt}`;
|
|
|
3696
3949
|
return void 0;
|
|
3697
3950
|
}
|
|
3698
3951
|
if (this.config.logDir) {
|
|
3699
|
-
return
|
|
3952
|
+
return import_node_path12.default.resolve(this.config.logDir);
|
|
3700
3953
|
}
|
|
3701
|
-
return
|
|
3954
|
+
return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "codex");
|
|
3702
3955
|
}
|
|
3703
3956
|
async createStreamLogger(request) {
|
|
3704
3957
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3706,13 +3959,13 @@ ${basePrompt}`;
|
|
|
3706
3959
|
return void 0;
|
|
3707
3960
|
}
|
|
3708
3961
|
try {
|
|
3709
|
-
await (0,
|
|
3962
|
+
await (0, import_promises11.mkdir)(logDir, { recursive: true });
|
|
3710
3963
|
} catch (error) {
|
|
3711
3964
|
const message = error instanceof Error ? error.message : String(error);
|
|
3712
3965
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
3713
3966
|
return void 0;
|
|
3714
3967
|
}
|
|
3715
|
-
const filePath =
|
|
3968
|
+
const filePath = import_node_path12.default.join(logDir, buildLogFilename2(request, this.targetName));
|
|
3716
3969
|
try {
|
|
3717
3970
|
const logger = await CodexStreamLogger.create({
|
|
3718
3971
|
filePath,
|
|
@@ -3927,9 +4180,9 @@ function tryParseJsonValue2(rawLine) {
|
|
|
3927
4180
|
async function locateExecutable(candidate) {
|
|
3928
4181
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
3929
4182
|
if (includesPathSeparator) {
|
|
3930
|
-
const resolved =
|
|
4183
|
+
const resolved = import_node_path12.default.isAbsolute(candidate) ? candidate : import_node_path12.default.resolve(candidate);
|
|
3931
4184
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
3932
|
-
await (0,
|
|
4185
|
+
await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3933
4186
|
return executablePath;
|
|
3934
4187
|
}
|
|
3935
4188
|
const locator = process.platform === "win32" ? "where" : "which";
|
|
@@ -3939,7 +4192,7 @@ async function locateExecutable(candidate) {
|
|
|
3939
4192
|
const preferred = selectExecutableCandidate(lines);
|
|
3940
4193
|
if (preferred) {
|
|
3941
4194
|
const executablePath = await ensureWindowsExecutableVariant(preferred);
|
|
3942
|
-
await (0,
|
|
4195
|
+
await (0, import_promises11.access)(executablePath, import_node_fs4.constants.F_OK);
|
|
3943
4196
|
return executablePath;
|
|
3944
4197
|
}
|
|
3945
4198
|
} catch {
|
|
@@ -3973,7 +4226,7 @@ async function ensureWindowsExecutableVariant(candidate) {
|
|
|
3973
4226
|
for (const ext of extensions) {
|
|
3974
4227
|
const withExtension = `${candidate}${ext}`;
|
|
3975
4228
|
try {
|
|
3976
|
-
await (0,
|
|
4229
|
+
await (0, import_promises11.access)(withExtension, import_node_fs4.constants.F_OK);
|
|
3977
4230
|
return withExtension;
|
|
3978
4231
|
} catch {
|
|
3979
4232
|
}
|
|
@@ -4438,9 +4691,9 @@ function extractToolCalls2(content) {
|
|
|
4438
4691
|
var import_node_child_process4 = require("child_process");
|
|
4439
4692
|
var import_node_crypto3 = require("crypto");
|
|
4440
4693
|
var import_node_fs5 = require("fs");
|
|
4441
|
-
var
|
|
4694
|
+
var import_promises12 = require("fs/promises");
|
|
4442
4695
|
var import_node_os4 = require("os");
|
|
4443
|
-
var
|
|
4696
|
+
var import_node_path13 = __toESM(require("path"), 1);
|
|
4444
4697
|
|
|
4445
4698
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
4446
4699
|
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
@@ -4524,8 +4777,8 @@ var PiCodingAgentProvider = class {
|
|
|
4524
4777
|
const workspaceRoot = await this.createWorkspace();
|
|
4525
4778
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
4526
4779
|
try {
|
|
4527
|
-
const promptFile =
|
|
4528
|
-
await (0,
|
|
4780
|
+
const promptFile = import_node_path13.default.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4781
|
+
await (0, import_promises12.writeFile)(promptFile, request.question, "utf8");
|
|
4529
4782
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
4530
4783
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
4531
4784
|
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
@@ -4566,7 +4819,7 @@ var PiCodingAgentProvider = class {
|
|
|
4566
4819
|
if (!this.config.cwd) {
|
|
4567
4820
|
return workspaceRoot;
|
|
4568
4821
|
}
|
|
4569
|
-
return
|
|
4822
|
+
return import_node_path13.default.resolve(this.config.cwd);
|
|
4570
4823
|
}
|
|
4571
4824
|
buildPiArgs(prompt, inputFiles) {
|
|
4572
4825
|
const args = [];
|
|
@@ -4655,19 +4908,19 @@ ${prompt}`;
|
|
|
4655
4908
|
return env;
|
|
4656
4909
|
}
|
|
4657
4910
|
async createWorkspace() {
|
|
4658
|
-
return await (0,
|
|
4911
|
+
return await (0, import_promises12.mkdtemp)(import_node_path13.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
|
|
4659
4912
|
}
|
|
4660
4913
|
async cleanupWorkspace(workspaceRoot) {
|
|
4661
4914
|
try {
|
|
4662
|
-
await (0,
|
|
4915
|
+
await (0, import_promises12.rm)(workspaceRoot, { recursive: true, force: true });
|
|
4663
4916
|
} catch {
|
|
4664
4917
|
}
|
|
4665
4918
|
}
|
|
4666
4919
|
resolveLogDirectory() {
|
|
4667
4920
|
if (this.config.logDir) {
|
|
4668
|
-
return
|
|
4921
|
+
return import_node_path13.default.resolve(this.config.logDir);
|
|
4669
4922
|
}
|
|
4670
|
-
return
|
|
4923
|
+
return import_node_path13.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
4671
4924
|
}
|
|
4672
4925
|
async createStreamLogger(request) {
|
|
4673
4926
|
const logDir = this.resolveLogDirectory();
|
|
@@ -4675,13 +4928,13 @@ ${prompt}`;
|
|
|
4675
4928
|
return void 0;
|
|
4676
4929
|
}
|
|
4677
4930
|
try {
|
|
4678
|
-
await (0,
|
|
4931
|
+
await (0, import_promises12.mkdir)(logDir, { recursive: true });
|
|
4679
4932
|
} catch (error) {
|
|
4680
4933
|
const message = error instanceof Error ? error.message : String(error);
|
|
4681
4934
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
4682
4935
|
return void 0;
|
|
4683
4936
|
}
|
|
4684
|
-
const filePath =
|
|
4937
|
+
const filePath = import_node_path13.default.join(logDir, buildLogFilename3(request, this.targetName));
|
|
4685
4938
|
try {
|
|
4686
4939
|
const logger = await PiStreamLogger.create({
|
|
4687
4940
|
filePath,
|
|
@@ -5114,7 +5367,7 @@ async function defaultPiRunner(options) {
|
|
|
5114
5367
|
}
|
|
5115
5368
|
|
|
5116
5369
|
// src/evaluation/providers/targets.ts
|
|
5117
|
-
var
|
|
5370
|
+
var import_node_path14 = __toESM(require("path"), 1);
|
|
5118
5371
|
var import_zod2 = require("zod");
|
|
5119
5372
|
var CliHealthcheckHttpInputSchema = import_zod2.z.object({
|
|
5120
5373
|
type: import_zod2.z.literal("http"),
|
|
@@ -5220,11 +5473,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
|
|
|
5220
5473
|
allowLiteral: true,
|
|
5221
5474
|
optionalEnv: true
|
|
5222
5475
|
});
|
|
5223
|
-
if (cwd && evalFilePath && !
|
|
5224
|
-
cwd =
|
|
5476
|
+
if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
|
|
5477
|
+
cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
|
|
5225
5478
|
}
|
|
5226
5479
|
if (!cwd && evalFilePath) {
|
|
5227
|
-
cwd =
|
|
5480
|
+
cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
|
|
5228
5481
|
}
|
|
5229
5482
|
return {
|
|
5230
5483
|
type: "command",
|
|
@@ -5251,11 +5504,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
|
|
|
5251
5504
|
allowLiteral: true,
|
|
5252
5505
|
optionalEnv: true
|
|
5253
5506
|
});
|
|
5254
|
-
if (cwd && evalFilePath && !
|
|
5255
|
-
cwd =
|
|
5507
|
+
if (cwd && evalFilePath && !import_node_path14.default.isAbsolute(cwd)) {
|
|
5508
|
+
cwd = import_node_path14.default.resolve(import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath)), cwd);
|
|
5256
5509
|
}
|
|
5257
5510
|
if (!cwd && evalFilePath) {
|
|
5258
|
-
cwd =
|
|
5511
|
+
cwd = import_node_path14.default.dirname(import_node_path14.default.resolve(evalFilePath));
|
|
5259
5512
|
}
|
|
5260
5513
|
const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
|
|
5261
5514
|
const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
|
|
@@ -5760,8 +6013,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
5760
6013
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
5761
6014
|
if (!parseResult.success) {
|
|
5762
6015
|
const firstError = parseResult.error.errors[0];
|
|
5763
|
-
const
|
|
5764
|
-
const prefix =
|
|
6016
|
+
const path18 = firstError?.path.join(".") || "";
|
|
6017
|
+
const prefix = path18 ? `${target.name} ${path18}: ` : `${target.name}: `;
|
|
5765
6018
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
5766
6019
|
}
|
|
5767
6020
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -5949,7 +6202,7 @@ function resolveOptionalNumberArray(source, description) {
|
|
|
5949
6202
|
}
|
|
5950
6203
|
|
|
5951
6204
|
// src/evaluation/providers/vscode.ts
|
|
5952
|
-
var
|
|
6205
|
+
var import_node_path15 = __toESM(require("path"), 1);
|
|
5953
6206
|
var import_subagent = require("subagent");
|
|
5954
6207
|
|
|
5955
6208
|
// src/evaluation/providers/vscode-templates.ts
|
|
@@ -6119,7 +6372,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
6119
6372
|
return "";
|
|
6120
6373
|
}
|
|
6121
6374
|
const buildList = (files) => files.map((absolutePath) => {
|
|
6122
|
-
const fileName =
|
|
6375
|
+
const fileName = import_node_path15.default.basename(absolutePath);
|
|
6123
6376
|
const fileUri = pathToFileUri2(absolutePath);
|
|
6124
6377
|
return `* [${fileName}](${fileUri})`;
|
|
6125
6378
|
});
|
|
@@ -6144,8 +6397,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
6144
6397
|
}
|
|
6145
6398
|
const unique = /* @__PURE__ */ new Map();
|
|
6146
6399
|
for (const attachment of attachments) {
|
|
6147
|
-
const absolutePath =
|
|
6148
|
-
const normalized = absolutePath.split(
|
|
6400
|
+
const absolutePath = import_node_path15.default.resolve(attachment);
|
|
6401
|
+
const normalized = absolutePath.split(import_node_path15.default.sep).join("/");
|
|
6149
6402
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
6150
6403
|
if (!unique.has(absolutePath)) {
|
|
6151
6404
|
unique.set(absolutePath, absolutePath);
|
|
@@ -6160,7 +6413,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
6160
6413
|
}
|
|
6161
6414
|
const unique = /* @__PURE__ */ new Map();
|
|
6162
6415
|
for (const attachment of attachments) {
|
|
6163
|
-
const absolutePath =
|
|
6416
|
+
const absolutePath = import_node_path15.default.resolve(attachment);
|
|
6164
6417
|
if (!unique.has(absolutePath)) {
|
|
6165
6418
|
unique.set(absolutePath, absolutePath);
|
|
6166
6419
|
}
|
|
@@ -6168,7 +6421,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
6168
6421
|
return Array.from(unique.values());
|
|
6169
6422
|
}
|
|
6170
6423
|
function pathToFileUri2(filePath) {
|
|
6171
|
-
const absolutePath =
|
|
6424
|
+
const absolutePath = import_node_path15.default.isAbsolute(filePath) ? filePath : import_node_path15.default.resolve(filePath);
|
|
6172
6425
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
6173
6426
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
6174
6427
|
return `file:///${normalizedPath}`;
|
|
@@ -6181,7 +6434,7 @@ function normalizeAttachments(attachments) {
|
|
|
6181
6434
|
}
|
|
6182
6435
|
const deduped = /* @__PURE__ */ new Set();
|
|
6183
6436
|
for (const attachment of attachments) {
|
|
6184
|
-
deduped.add(
|
|
6437
|
+
deduped.add(import_node_path15.default.resolve(attachment));
|
|
6185
6438
|
}
|
|
6186
6439
|
return Array.from(deduped);
|
|
6187
6440
|
}
|
|
@@ -6190,7 +6443,7 @@ function mergeAttachments(all) {
|
|
|
6190
6443
|
for (const list of all) {
|
|
6191
6444
|
if (!list) continue;
|
|
6192
6445
|
for (const inputFile of list) {
|
|
6193
|
-
deduped.add(
|
|
6446
|
+
deduped.add(import_node_path15.default.resolve(inputFile));
|
|
6194
6447
|
}
|
|
6195
6448
|
}
|
|
6196
6449
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -6238,9 +6491,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
6238
6491
|
|
|
6239
6492
|
// src/evaluation/providers/targets-file.ts
|
|
6240
6493
|
var import_node_fs6 = require("fs");
|
|
6241
|
-
var
|
|
6242
|
-
var
|
|
6243
|
-
var
|
|
6494
|
+
var import_promises13 = require("fs/promises");
|
|
6495
|
+
var import_node_path16 = __toESM(require("path"), 1);
|
|
6496
|
+
var import_yaml4 = require("yaml");
|
|
6244
6497
|
function isRecord(value) {
|
|
6245
6498
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
6246
6499
|
}
|
|
@@ -6269,19 +6522,19 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
6269
6522
|
}
|
|
6270
6523
|
async function fileExists3(filePath) {
|
|
6271
6524
|
try {
|
|
6272
|
-
await (0,
|
|
6525
|
+
await (0, import_promises13.access)(filePath, import_node_fs6.constants.F_OK);
|
|
6273
6526
|
return true;
|
|
6274
6527
|
} catch {
|
|
6275
6528
|
return false;
|
|
6276
6529
|
}
|
|
6277
6530
|
}
|
|
6278
6531
|
async function readTargetDefinitions(filePath) {
|
|
6279
|
-
const absolutePath =
|
|
6532
|
+
const absolutePath = import_node_path16.default.resolve(filePath);
|
|
6280
6533
|
if (!await fileExists3(absolutePath)) {
|
|
6281
6534
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
6282
6535
|
}
|
|
6283
|
-
const raw = await (0,
|
|
6284
|
-
const parsed = (0,
|
|
6536
|
+
const raw = await (0, import_promises13.readFile)(absolutePath, "utf8");
|
|
6537
|
+
const parsed = (0, import_yaml4.parse)(raw);
|
|
6285
6538
|
if (!isRecord(parsed)) {
|
|
6286
6539
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
6287
6540
|
}
|
|
@@ -6487,15 +6740,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
6487
6740
|
});
|
|
6488
6741
|
}
|
|
6489
6742
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
6490
|
-
const { mkdir: mkdir4, readFile:
|
|
6743
|
+
const { mkdir: mkdir4, readFile: readFile9, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
|
|
6491
6744
|
const { tmpdir: tmpdir4 } = await import("os");
|
|
6492
|
-
const
|
|
6745
|
+
const path18 = await import("path");
|
|
6493
6746
|
const { randomUUID: randomUUID4 } = await import("crypto");
|
|
6494
|
-
const dir =
|
|
6747
|
+
const dir = path18.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
6495
6748
|
await mkdir4(dir, { recursive: true });
|
|
6496
|
-
const stdinPath =
|
|
6497
|
-
const stdoutPath =
|
|
6498
|
-
const stderrPath =
|
|
6749
|
+
const stdinPath = path18.join(dir, "stdin.txt");
|
|
6750
|
+
const stdoutPath = path18.join(dir, "stdout.txt");
|
|
6751
|
+
const stderrPath = path18.join(dir, "stderr.txt");
|
|
6499
6752
|
await writeFile4(stdinPath, stdinPayload, "utf8");
|
|
6500
6753
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
6501
6754
|
const { spawn: spawn4 } = await import("child_process");
|
|
@@ -6525,8 +6778,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
6525
6778
|
resolve(code ?? 0);
|
|
6526
6779
|
});
|
|
6527
6780
|
});
|
|
6528
|
-
const stdout = (await
|
|
6529
|
-
const stderr = (await
|
|
6781
|
+
const stdout = (await readFile9(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
6782
|
+
const stderr = (await readFile9(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
6530
6783
|
return { stdout, stderr, exitCode };
|
|
6531
6784
|
} finally {
|
|
6532
6785
|
await rm4(dir, { recursive: true, force: true });
|
|
@@ -6798,7 +7051,7 @@ var CodeEvaluator = class {
|
|
|
6798
7051
|
outputMessages: context.outputMessages ?? null,
|
|
6799
7052
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
6800
7053
|
inputFiles: context.evalCase.file_paths.filter(
|
|
6801
|
-
(
|
|
7054
|
+
(path18) => !context.evalCase.guideline_paths.includes(path18)
|
|
6802
7055
|
),
|
|
6803
7056
|
inputMessages: context.evalCase.input_messages,
|
|
6804
7057
|
traceSummary: context.traceSummary ?? null,
|
|
@@ -7584,115 +7837,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
7584
7837
|
* Evaluate a single field against the expected value.
|
|
7585
7838
|
*/
|
|
7586
7839
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
7587
|
-
const { path:
|
|
7588
|
-
const candidateValue = resolvePath(candidateData,
|
|
7589
|
-
const expectedValue = resolvePath(expectedData,
|
|
7840
|
+
const { path: path18, match, required = true, weight = 1 } = fieldConfig;
|
|
7841
|
+
const candidateValue = resolvePath(candidateData, path18);
|
|
7842
|
+
const expectedValue = resolvePath(expectedData, path18);
|
|
7590
7843
|
if (expectedValue === void 0) {
|
|
7591
7844
|
return {
|
|
7592
|
-
path:
|
|
7845
|
+
path: path18,
|
|
7593
7846
|
score: 1,
|
|
7594
7847
|
// No expected value means no comparison needed
|
|
7595
7848
|
weight,
|
|
7596
7849
|
hit: true,
|
|
7597
|
-
message: `${
|
|
7850
|
+
message: `${path18}: no expected value`
|
|
7598
7851
|
};
|
|
7599
7852
|
}
|
|
7600
7853
|
if (candidateValue === void 0) {
|
|
7601
7854
|
if (required) {
|
|
7602
7855
|
return {
|
|
7603
|
-
path:
|
|
7856
|
+
path: path18,
|
|
7604
7857
|
score: 0,
|
|
7605
7858
|
weight,
|
|
7606
7859
|
hit: false,
|
|
7607
|
-
message: `${
|
|
7860
|
+
message: `${path18} (required, missing)`
|
|
7608
7861
|
};
|
|
7609
7862
|
}
|
|
7610
7863
|
return {
|
|
7611
|
-
path:
|
|
7864
|
+
path: path18,
|
|
7612
7865
|
score: 1,
|
|
7613
7866
|
// Don't penalize missing optional fields
|
|
7614
7867
|
weight: 0,
|
|
7615
7868
|
// Zero weight means it won't affect the score
|
|
7616
7869
|
hit: true,
|
|
7617
|
-
message: `${
|
|
7870
|
+
message: `${path18}: optional field missing`
|
|
7618
7871
|
};
|
|
7619
7872
|
}
|
|
7620
7873
|
switch (match) {
|
|
7621
7874
|
case "exact":
|
|
7622
|
-
return this.compareExact(
|
|
7875
|
+
return this.compareExact(path18, candidateValue, expectedValue, weight);
|
|
7623
7876
|
case "numeric_tolerance":
|
|
7624
7877
|
return this.compareNumericTolerance(
|
|
7625
|
-
|
|
7878
|
+
path18,
|
|
7626
7879
|
candidateValue,
|
|
7627
7880
|
expectedValue,
|
|
7628
7881
|
fieldConfig,
|
|
7629
7882
|
weight
|
|
7630
7883
|
);
|
|
7631
7884
|
case "date":
|
|
7632
|
-
return this.compareDate(
|
|
7885
|
+
return this.compareDate(path18, candidateValue, expectedValue, fieldConfig, weight);
|
|
7633
7886
|
default:
|
|
7634
7887
|
return {
|
|
7635
|
-
path:
|
|
7888
|
+
path: path18,
|
|
7636
7889
|
score: 0,
|
|
7637
7890
|
weight,
|
|
7638
7891
|
hit: false,
|
|
7639
|
-
message: `${
|
|
7892
|
+
message: `${path18}: unknown match type "${match}"`
|
|
7640
7893
|
};
|
|
7641
7894
|
}
|
|
7642
7895
|
}
|
|
7643
7896
|
/**
|
|
7644
7897
|
* Exact equality comparison.
|
|
7645
7898
|
*/
|
|
7646
|
-
compareExact(
|
|
7899
|
+
compareExact(path18, candidateValue, expectedValue, weight) {
|
|
7647
7900
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
7648
7901
|
return {
|
|
7649
|
-
path:
|
|
7902
|
+
path: path18,
|
|
7650
7903
|
score: 1,
|
|
7651
7904
|
weight,
|
|
7652
7905
|
hit: true,
|
|
7653
|
-
message:
|
|
7906
|
+
message: path18
|
|
7654
7907
|
};
|
|
7655
7908
|
}
|
|
7656
7909
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
7657
7910
|
return {
|
|
7658
|
-
path:
|
|
7911
|
+
path: path18,
|
|
7659
7912
|
score: 0,
|
|
7660
7913
|
weight,
|
|
7661
7914
|
hit: false,
|
|
7662
|
-
message: `${
|
|
7915
|
+
message: `${path18} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
7663
7916
|
};
|
|
7664
7917
|
}
|
|
7665
7918
|
return {
|
|
7666
|
-
path:
|
|
7919
|
+
path: path18,
|
|
7667
7920
|
score: 0,
|
|
7668
7921
|
weight,
|
|
7669
7922
|
hit: false,
|
|
7670
|
-
message: `${
|
|
7923
|
+
message: `${path18} (value mismatch)`
|
|
7671
7924
|
};
|
|
7672
7925
|
}
|
|
7673
7926
|
/**
|
|
7674
7927
|
* Numeric comparison with absolute or relative tolerance.
|
|
7675
7928
|
*/
|
|
7676
|
-
compareNumericTolerance(
|
|
7929
|
+
compareNumericTolerance(path18, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7677
7930
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
7678
7931
|
const candidateNum = toNumber(candidateValue);
|
|
7679
7932
|
const expectedNum = toNumber(expectedValue);
|
|
7680
7933
|
if (candidateNum === null || expectedNum === null) {
|
|
7681
7934
|
return {
|
|
7682
|
-
path:
|
|
7935
|
+
path: path18,
|
|
7683
7936
|
score: 0,
|
|
7684
7937
|
weight,
|
|
7685
7938
|
hit: false,
|
|
7686
|
-
message: `${
|
|
7939
|
+
message: `${path18} (non-numeric value)`
|
|
7687
7940
|
};
|
|
7688
7941
|
}
|
|
7689
7942
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
7690
7943
|
return {
|
|
7691
|
-
path:
|
|
7944
|
+
path: path18,
|
|
7692
7945
|
score: 0,
|
|
7693
7946
|
weight,
|
|
7694
7947
|
hit: false,
|
|
7695
|
-
message: `${
|
|
7948
|
+
message: `${path18} (invalid numeric value)`
|
|
7696
7949
|
};
|
|
7697
7950
|
}
|
|
7698
7951
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -7705,61 +7958,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
7705
7958
|
}
|
|
7706
7959
|
if (withinTolerance) {
|
|
7707
7960
|
return {
|
|
7708
|
-
path:
|
|
7961
|
+
path: path18,
|
|
7709
7962
|
score: 1,
|
|
7710
7963
|
weight,
|
|
7711
7964
|
hit: true,
|
|
7712
|
-
message: `${
|
|
7965
|
+
message: `${path18} (within tolerance: diff=${diff.toFixed(2)})`
|
|
7713
7966
|
};
|
|
7714
7967
|
}
|
|
7715
7968
|
return {
|
|
7716
|
-
path:
|
|
7969
|
+
path: path18,
|
|
7717
7970
|
score: 0,
|
|
7718
7971
|
weight,
|
|
7719
7972
|
hit: false,
|
|
7720
|
-
message: `${
|
|
7973
|
+
message: `${path18} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
7721
7974
|
};
|
|
7722
7975
|
}
|
|
7723
7976
|
/**
|
|
7724
7977
|
* Date comparison with format normalization.
|
|
7725
7978
|
*/
|
|
7726
|
-
compareDate(
|
|
7979
|
+
compareDate(path18, candidateValue, expectedValue, fieldConfig, weight) {
|
|
7727
7980
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
7728
7981
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
7729
7982
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
7730
7983
|
if (candidateDate === null) {
|
|
7731
7984
|
return {
|
|
7732
|
-
path:
|
|
7985
|
+
path: path18,
|
|
7733
7986
|
score: 0,
|
|
7734
7987
|
weight,
|
|
7735
7988
|
hit: false,
|
|
7736
|
-
message: `${
|
|
7989
|
+
message: `${path18} (unparseable candidate date)`
|
|
7737
7990
|
};
|
|
7738
7991
|
}
|
|
7739
7992
|
if (expectedDate === null) {
|
|
7740
7993
|
return {
|
|
7741
|
-
path:
|
|
7994
|
+
path: path18,
|
|
7742
7995
|
score: 0,
|
|
7743
7996
|
weight,
|
|
7744
7997
|
hit: false,
|
|
7745
|
-
message: `${
|
|
7998
|
+
message: `${path18} (unparseable expected date)`
|
|
7746
7999
|
};
|
|
7747
8000
|
}
|
|
7748
8001
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
7749
8002
|
return {
|
|
7750
|
-
path:
|
|
8003
|
+
path: path18,
|
|
7751
8004
|
score: 1,
|
|
7752
8005
|
weight,
|
|
7753
8006
|
hit: true,
|
|
7754
|
-
message:
|
|
8007
|
+
message: path18
|
|
7755
8008
|
};
|
|
7756
8009
|
}
|
|
7757
8010
|
return {
|
|
7758
|
-
path:
|
|
8011
|
+
path: path18,
|
|
7759
8012
|
score: 0,
|
|
7760
8013
|
weight,
|
|
7761
8014
|
hit: false,
|
|
7762
|
-
message: `${
|
|
8015
|
+
message: `${path18} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
7763
8016
|
};
|
|
7764
8017
|
}
|
|
7765
8018
|
/**
|
|
@@ -7799,11 +8052,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
7799
8052
|
};
|
|
7800
8053
|
}
|
|
7801
8054
|
};
|
|
7802
|
-
function resolvePath(obj,
|
|
7803
|
-
if (!
|
|
8055
|
+
function resolvePath(obj, path18) {
|
|
8056
|
+
if (!path18 || !obj) {
|
|
7804
8057
|
return void 0;
|
|
7805
8058
|
}
|
|
7806
|
-
const parts =
|
|
8059
|
+
const parts = path18.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
7807
8060
|
let current = obj;
|
|
7808
8061
|
for (const part of parts) {
|
|
7809
8062
|
if (current === null || current === void 0) {
|
|
@@ -8239,7 +8492,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
8239
8492
|
|
|
8240
8493
|
// src/evaluation/orchestrator.ts
|
|
8241
8494
|
var import_node_crypto5 = require("crypto");
|
|
8242
|
-
var
|
|
8495
|
+
var import_node_path17 = __toESM(require("path"), 1);
|
|
8243
8496
|
|
|
8244
8497
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
8245
8498
|
var Node = class {
|
|
@@ -9038,7 +9291,7 @@ async function runEvaluatorList(options) {
|
|
|
9038
9291
|
});
|
|
9039
9292
|
}
|
|
9040
9293
|
if (evaluator.type === "composite") {
|
|
9041
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
9294
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path17.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
9042
9295
|
const createEvaluator = (memberConfig) => {
|
|
9043
9296
|
switch (memberConfig.type) {
|
|
9044
9297
|
case "llm_judge":
|
|
@@ -9613,6 +9866,7 @@ function createAgentKernel() {
|
|
|
9613
9866
|
createAgentKernel,
|
|
9614
9867
|
createProvider,
|
|
9615
9868
|
deepEqual,
|
|
9869
|
+
detectFormat,
|
|
9616
9870
|
ensureVSCodeSubagents,
|
|
9617
9871
|
executeScript,
|
|
9618
9872
|
explorationRatio,
|