@agentv/core 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs +0 -11
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +0 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +458 -211
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -2
- package/dist/index.d.ts +8 -2
- package/dist/index.js +405 -159
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -146,8 +146,8 @@ function mergeExecutionMetrics(summary, metrics) {
|
|
|
146
146
|
}
|
|
147
147
|
|
|
148
148
|
// src/evaluation/yaml-parser.ts
|
|
149
|
-
import { readFile as
|
|
150
|
-
import
|
|
149
|
+
import { readFile as readFile6 } from "node:fs/promises";
|
|
150
|
+
import path7 from "node:path";
|
|
151
151
|
import { parse as parse2 } from "yaml";
|
|
152
152
|
|
|
153
153
|
// src/evaluation/loaders/config-loader.ts
|
|
@@ -257,7 +257,6 @@ async function resolveFileReference2(rawValue, searchRoots) {
|
|
|
257
257
|
}
|
|
258
258
|
|
|
259
259
|
// src/evaluation/loaders/config-loader.ts
|
|
260
|
-
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
261
260
|
var ANSI_YELLOW = "\x1B[33m";
|
|
262
261
|
var ANSI_RESET = "\x1B[0m";
|
|
263
262
|
async function loadConfig(evalFilePath, repoRoot) {
|
|
@@ -275,13 +274,6 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
275
274
|
continue;
|
|
276
275
|
}
|
|
277
276
|
const config = parsed;
|
|
278
|
-
const schema = config.$schema;
|
|
279
|
-
if (schema !== SCHEMA_CONFIG_V2) {
|
|
280
|
-
const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${configPath}. Expected '${SCHEMA_CONFIG_V2}'` : `Missing required field '$schema' in ${configPath}.
|
|
281
|
-
Please add '$schema: ${SCHEMA_CONFIG_V2}' at the top of the file.`;
|
|
282
|
-
logWarning(message);
|
|
283
|
-
continue;
|
|
284
|
-
}
|
|
285
277
|
const guidelinePatterns = config.guideline_patterns;
|
|
286
278
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
287
279
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -390,7 +382,8 @@ var ANSI_YELLOW3 = "\x1B[33m";
|
|
|
390
382
|
var ANSI_RESET3 = "\x1B[0m";
|
|
391
383
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
392
384
|
const execution = rawEvalCase.execution;
|
|
393
|
-
const
|
|
385
|
+
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
386
|
+
const candidateEvaluators = (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators ?? globalExecution?.evaluators;
|
|
394
387
|
if (candidateEvaluators === void 0) {
|
|
395
388
|
return void 0;
|
|
396
389
|
}
|
|
@@ -933,6 +926,11 @@ function isValidFieldAggregationType(value) {
|
|
|
933
926
|
return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
|
|
934
927
|
}
|
|
935
928
|
|
|
929
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
930
|
+
import { readFile as readFile4 } from "node:fs/promises";
|
|
931
|
+
import path5 from "node:path";
|
|
932
|
+
import { parse as parseYaml } from "yaml";
|
|
933
|
+
|
|
936
934
|
// src/evaluation/loaders/message-processor.ts
|
|
937
935
|
import { readFile as readFile3 } from "node:fs/promises";
|
|
938
936
|
import path4 from "node:path";
|
|
@@ -1193,28 +1191,271 @@ async function processExpectedMessages(options) {
|
|
|
1193
1191
|
return segments;
|
|
1194
1192
|
}
|
|
1195
1193
|
|
|
1196
|
-
// src/evaluation/
|
|
1197
|
-
import { readFile as readFile4 } from "node:fs/promises";
|
|
1198
|
-
import path5 from "node:path";
|
|
1194
|
+
// src/evaluation/loaders/jsonl-parser.ts
|
|
1199
1195
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
1196
|
+
var ANSI_RED = "\x1B[31m";
|
|
1200
1197
|
var ANSI_RESET5 = "\x1B[0m";
|
|
1198
|
+
function detectFormat(filePath) {
|
|
1199
|
+
const ext = path5.extname(filePath).toLowerCase();
|
|
1200
|
+
if (ext === ".jsonl") return "jsonl";
|
|
1201
|
+
if (ext === ".yaml" || ext === ".yml") return "yaml";
|
|
1202
|
+
throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`);
|
|
1203
|
+
}
|
|
1204
|
+
async function loadSidecarMetadata(jsonlPath, verbose) {
|
|
1205
|
+
const dir = path5.dirname(jsonlPath);
|
|
1206
|
+
const base = path5.basename(jsonlPath, ".jsonl");
|
|
1207
|
+
const sidecarPath = path5.join(dir, `${base}.yaml`);
|
|
1208
|
+
if (!await fileExists2(sidecarPath)) {
|
|
1209
|
+
if (verbose) {
|
|
1210
|
+
logWarning4(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`);
|
|
1211
|
+
}
|
|
1212
|
+
return {};
|
|
1213
|
+
}
|
|
1214
|
+
try {
|
|
1215
|
+
const content = await readFile4(sidecarPath, "utf8");
|
|
1216
|
+
const parsed = parseYaml(content);
|
|
1217
|
+
if (!isJsonObject(parsed)) {
|
|
1218
|
+
logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
|
|
1219
|
+
return {};
|
|
1220
|
+
}
|
|
1221
|
+
return {
|
|
1222
|
+
description: asString4(parsed.description),
|
|
1223
|
+
dataset: asString4(parsed.dataset),
|
|
1224
|
+
execution: isJsonObject(parsed.execution) ? parsed.execution : void 0,
|
|
1225
|
+
evaluator: parsed.evaluator
|
|
1226
|
+
};
|
|
1227
|
+
} catch (error) {
|
|
1228
|
+
logWarning4(`Could not read sidecar metadata from ${sidecarPath}: ${error.message}`);
|
|
1229
|
+
return {};
|
|
1230
|
+
}
|
|
1231
|
+
}
|
|
1232
|
+
function parseJsonlContent(content, filePath) {
|
|
1233
|
+
const lines = content.split("\n");
|
|
1234
|
+
const cases = [];
|
|
1235
|
+
for (let i = 0; i < lines.length; i++) {
|
|
1236
|
+
const line = lines[i].trim();
|
|
1237
|
+
if (line === "") continue;
|
|
1238
|
+
try {
|
|
1239
|
+
const parsed = JSON.parse(line);
|
|
1240
|
+
if (!isJsonObject(parsed)) {
|
|
1241
|
+
throw new Error("Expected JSON object");
|
|
1242
|
+
}
|
|
1243
|
+
cases.push(parsed);
|
|
1244
|
+
} catch (error) {
|
|
1245
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1246
|
+
throw new Error(`Line ${i + 1}: Invalid JSON - ${message}
|
|
1247
|
+
File: ${filePath}`);
|
|
1248
|
+
}
|
|
1249
|
+
}
|
|
1250
|
+
return cases;
|
|
1251
|
+
}
|
|
1252
|
+
async function loadEvalCasesFromJsonl(evalFilePath, repoRoot, options) {
|
|
1253
|
+
const verbose = options?.verbose ?? false;
|
|
1254
|
+
const evalIdFilter = options?.evalId;
|
|
1255
|
+
const absoluteTestPath = path5.resolve(evalFilePath);
|
|
1256
|
+
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1257
|
+
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
1258
|
+
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1259
|
+
const guidelinePatterns = config?.guideline_patterns;
|
|
1260
|
+
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
1261
|
+
const rawFile = await readFile4(absoluteTestPath, "utf8");
|
|
1262
|
+
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
1263
|
+
const fallbackDataset = path5.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
1264
|
+
const datasetName = sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset;
|
|
1265
|
+
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm_judge";
|
|
1266
|
+
const globalExecution = sidecar.execution;
|
|
1267
|
+
if (verbose) {
|
|
1268
|
+
console.log(`
|
|
1269
|
+
[JSONL Dataset: ${evalFilePath}]`);
|
|
1270
|
+
console.log(` Cases: ${rawCases.length}`);
|
|
1271
|
+
console.log(` Dataset name: ${datasetName}`);
|
|
1272
|
+
if (sidecar.description) {
|
|
1273
|
+
console.log(` Description: ${sidecar.description}`);
|
|
1274
|
+
}
|
|
1275
|
+
}
|
|
1276
|
+
const results = [];
|
|
1277
|
+
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
1278
|
+
const evalcase = rawCases[lineIndex];
|
|
1279
|
+
const lineNumber = lineIndex + 1;
|
|
1280
|
+
const id = asString4(evalcase.id);
|
|
1281
|
+
if (evalIdFilter && id !== evalIdFilter) {
|
|
1282
|
+
continue;
|
|
1283
|
+
}
|
|
1284
|
+
const conversationId = asString4(evalcase.conversation_id);
|
|
1285
|
+
const outcome = asString4(evalcase.expected_outcome) ?? asString4(evalcase.outcome);
|
|
1286
|
+
const inputMessagesValue = evalcase.input_messages;
|
|
1287
|
+
const expectedMessagesValue = evalcase.expected_messages;
|
|
1288
|
+
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
1289
|
+
logError(
|
|
1290
|
+
`Skipping incomplete eval case at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, expected_outcome, and/or input_messages`
|
|
1291
|
+
);
|
|
1292
|
+
continue;
|
|
1293
|
+
}
|
|
1294
|
+
const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
|
|
1295
|
+
const inputMessages = inputMessagesValue.filter(
|
|
1296
|
+
(msg) => isTestMessage(msg)
|
|
1297
|
+
);
|
|
1298
|
+
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1299
|
+
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1300
|
+
logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`);
|
|
1301
|
+
continue;
|
|
1302
|
+
}
|
|
1303
|
+
const guidelinePaths = [];
|
|
1304
|
+
const inputTextParts = [];
|
|
1305
|
+
const inputSegments = await processMessages({
|
|
1306
|
+
messages: inputMessages,
|
|
1307
|
+
searchRoots,
|
|
1308
|
+
repoRootPath,
|
|
1309
|
+
guidelinePatterns,
|
|
1310
|
+
guidelinePaths,
|
|
1311
|
+
textParts: inputTextParts,
|
|
1312
|
+
messageType: "input",
|
|
1313
|
+
verbose
|
|
1314
|
+
});
|
|
1315
|
+
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
1316
|
+
messages: expectedMessages,
|
|
1317
|
+
searchRoots,
|
|
1318
|
+
repoRootPath,
|
|
1319
|
+
verbose
|
|
1320
|
+
}) : [];
|
|
1321
|
+
let referenceAnswer = "";
|
|
1322
|
+
if (outputSegments.length > 0) {
|
|
1323
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
1324
|
+
const content = lastMessage.content;
|
|
1325
|
+
const toolCalls = lastMessage.tool_calls;
|
|
1326
|
+
if (typeof content === "string") {
|
|
1327
|
+
referenceAnswer = content;
|
|
1328
|
+
} else if (content !== void 0 && content !== null) {
|
|
1329
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
1330
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
1331
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
1332
|
+
}
|
|
1333
|
+
}
|
|
1334
|
+
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
1335
|
+
const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
|
|
1336
|
+
const mergedExecution = caseExecution ?? globalExecution;
|
|
1337
|
+
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
1338
|
+
let evaluators;
|
|
1339
|
+
try {
|
|
1340
|
+
evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
|
|
1341
|
+
} catch (error) {
|
|
1342
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
1343
|
+
logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`);
|
|
1344
|
+
continue;
|
|
1345
|
+
}
|
|
1346
|
+
const inlineRubrics = evalcase.rubrics;
|
|
1347
|
+
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
1348
|
+
const rubricItems = inlineRubrics.filter((r) => isJsonObject(r) || typeof r === "string").map((rubric, index) => {
|
|
1349
|
+
if (typeof rubric === "string") {
|
|
1350
|
+
return {
|
|
1351
|
+
id: `rubric-${index + 1}`,
|
|
1352
|
+
description: rubric,
|
|
1353
|
+
weight: 1,
|
|
1354
|
+
required: true
|
|
1355
|
+
};
|
|
1356
|
+
}
|
|
1357
|
+
return {
|
|
1358
|
+
id: asString4(rubric.id) ?? `rubric-${index + 1}`,
|
|
1359
|
+
description: asString4(rubric.description) ?? "",
|
|
1360
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1361
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1362
|
+
};
|
|
1363
|
+
}).filter((r) => r.description.length > 0);
|
|
1364
|
+
if (rubricItems.length > 0) {
|
|
1365
|
+
const rubricEvaluator = {
|
|
1366
|
+
name: "rubric",
|
|
1367
|
+
type: "llm_judge",
|
|
1368
|
+
rubrics: rubricItems
|
|
1369
|
+
};
|
|
1370
|
+
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
1371
|
+
}
|
|
1372
|
+
}
|
|
1373
|
+
const userFilePaths = [];
|
|
1374
|
+
for (const segment of inputSegments) {
|
|
1375
|
+
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
1376
|
+
userFilePaths.push(segment.resolvedPath);
|
|
1377
|
+
}
|
|
1378
|
+
}
|
|
1379
|
+
const allFilePaths = [
|
|
1380
|
+
...guidelinePaths.map((guidelinePath) => path5.resolve(guidelinePath)),
|
|
1381
|
+
...userFilePaths
|
|
1382
|
+
];
|
|
1383
|
+
const testCase = {
|
|
1384
|
+
id,
|
|
1385
|
+
dataset: datasetName,
|
|
1386
|
+
conversation_id: conversationId,
|
|
1387
|
+
question,
|
|
1388
|
+
input_messages: inputMessages,
|
|
1389
|
+
input_segments: inputSegments,
|
|
1390
|
+
expected_messages: outputSegments,
|
|
1391
|
+
reference_answer: referenceAnswer,
|
|
1392
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => path5.resolve(guidelinePath)),
|
|
1393
|
+
guideline_patterns: guidelinePatterns,
|
|
1394
|
+
file_paths: allFilePaths,
|
|
1395
|
+
expected_outcome: outcome,
|
|
1396
|
+
evaluator: evalCaseEvaluatorKind,
|
|
1397
|
+
evaluators
|
|
1398
|
+
};
|
|
1399
|
+
if (verbose) {
|
|
1400
|
+
console.log(`
|
|
1401
|
+
[Eval Case: ${id}]`);
|
|
1402
|
+
if (testCase.guideline_paths.length > 0) {
|
|
1403
|
+
console.log(` Guidelines used: ${testCase.guideline_paths.length}`);
|
|
1404
|
+
for (const guidelinePath of testCase.guideline_paths) {
|
|
1405
|
+
console.log(` - ${guidelinePath}`);
|
|
1406
|
+
}
|
|
1407
|
+
} else {
|
|
1408
|
+
console.log(" No guidelines found");
|
|
1409
|
+
}
|
|
1410
|
+
}
|
|
1411
|
+
results.push(testCase);
|
|
1412
|
+
}
|
|
1413
|
+
return results;
|
|
1414
|
+
}
|
|
1415
|
+
function asString4(value) {
|
|
1416
|
+
return typeof value === "string" ? value : void 0;
|
|
1417
|
+
}
|
|
1418
|
+
function logWarning4(message, details) {
|
|
1419
|
+
if (details && details.length > 0) {
|
|
1420
|
+
const detailBlock = details.join("\n");
|
|
1421
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}
|
|
1422
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1423
|
+
} else {
|
|
1424
|
+
console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
|
|
1425
|
+
}
|
|
1426
|
+
}
|
|
1427
|
+
function logError(message, details) {
|
|
1428
|
+
if (details && details.length > 0) {
|
|
1429
|
+
const detailBlock = details.join("\n");
|
|
1430
|
+
console.error(`${ANSI_RED}Error: ${message}
|
|
1431
|
+
${detailBlock}${ANSI_RESET5}`);
|
|
1432
|
+
} else {
|
|
1433
|
+
console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET5}`);
|
|
1434
|
+
}
|
|
1435
|
+
}
|
|
1436
|
+
|
|
1437
|
+
// src/evaluation/formatting/prompt-builder.ts
|
|
1438
|
+
import { readFile as readFile5 } from "node:fs/promises";
|
|
1439
|
+
import path6 from "node:path";
|
|
1440
|
+
var ANSI_YELLOW6 = "\x1B[33m";
|
|
1441
|
+
var ANSI_RESET6 = "\x1B[0m";
|
|
1201
1442
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
1202
1443
|
const guidelineParts = [];
|
|
1203
1444
|
for (const rawPath of testCase.guideline_paths) {
|
|
1204
|
-
const absolutePath =
|
|
1445
|
+
const absolutePath = path6.resolve(rawPath);
|
|
1205
1446
|
if (!await fileExists2(absolutePath)) {
|
|
1206
|
-
|
|
1447
|
+
logWarning5(`Could not read guideline file ${absolutePath}: file does not exist`);
|
|
1207
1448
|
continue;
|
|
1208
1449
|
}
|
|
1209
1450
|
try {
|
|
1210
|
-
const content = (await
|
|
1451
|
+
const content = (await readFile5(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
1211
1452
|
guidelineParts.push({
|
|
1212
1453
|
content,
|
|
1213
1454
|
isFile: true,
|
|
1214
|
-
displayPath:
|
|
1455
|
+
displayPath: path6.basename(absolutePath)
|
|
1215
1456
|
});
|
|
1216
1457
|
} catch (error) {
|
|
1217
|
-
|
|
1458
|
+
logWarning5(`Could not read guideline file ${absolutePath}: ${error.message}`);
|
|
1218
1459
|
}
|
|
1219
1460
|
}
|
|
1220
1461
|
const guidelines = formatFileContents(guidelineParts);
|
|
@@ -1238,9 +1479,9 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1238
1479
|
messageSegments.push({ type: "text", value: segment });
|
|
1239
1480
|
}
|
|
1240
1481
|
} else if (isJsonObject(segment)) {
|
|
1241
|
-
const type =
|
|
1482
|
+
const type = asString5(segment.type);
|
|
1242
1483
|
if (type === "file") {
|
|
1243
|
-
const value =
|
|
1484
|
+
const value = asString5(segment.value);
|
|
1244
1485
|
if (!value) continue;
|
|
1245
1486
|
if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
|
|
1246
1487
|
messageSegments.push({ type: "guideline_ref", path: value });
|
|
@@ -1251,7 +1492,7 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
1251
1492
|
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
1252
1493
|
}
|
|
1253
1494
|
} else if (type === "text") {
|
|
1254
|
-
const textValue =
|
|
1495
|
+
const textValue = asString5(segment.value);
|
|
1255
1496
|
if (textValue && textValue.trim().length > 0) {
|
|
1256
1497
|
messageSegments.push({ type: "text", value: textValue });
|
|
1257
1498
|
}
|
|
@@ -1405,21 +1646,21 @@ ${guidelineContent.trim()}`);
|
|
|
1405
1646
|
}
|
|
1406
1647
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
1407
1648
|
}
|
|
1408
|
-
function
|
|
1649
|
+
function asString5(value) {
|
|
1409
1650
|
return typeof value === "string" ? value : void 0;
|
|
1410
1651
|
}
|
|
1411
|
-
function
|
|
1412
|
-
console.warn(`${
|
|
1652
|
+
function logWarning5(message) {
|
|
1653
|
+
console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
|
|
1413
1654
|
}
|
|
1414
1655
|
|
|
1415
1656
|
// src/evaluation/yaml-parser.ts
|
|
1416
|
-
var
|
|
1417
|
-
var
|
|
1418
|
-
var
|
|
1657
|
+
var ANSI_YELLOW7 = "\x1B[33m";
|
|
1658
|
+
var ANSI_RED2 = "\x1B[31m";
|
|
1659
|
+
var ANSI_RESET7 = "\x1B[0m";
|
|
1419
1660
|
async function readTestSuiteMetadata(testFilePath) {
|
|
1420
1661
|
try {
|
|
1421
|
-
const absolutePath =
|
|
1422
|
-
const content = await
|
|
1662
|
+
const absolutePath = path7.resolve(testFilePath);
|
|
1663
|
+
const content = await readFile6(absolutePath, "utf8");
|
|
1423
1664
|
const parsed = parse2(content);
|
|
1424
1665
|
if (!isJsonObject(parsed)) {
|
|
1425
1666
|
return {};
|
|
@@ -1430,21 +1671,25 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
1430
1671
|
}
|
|
1431
1672
|
}
|
|
1432
1673
|
async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
1674
|
+
const format = detectFormat(evalFilePath);
|
|
1675
|
+
if (format === "jsonl") {
|
|
1676
|
+
return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
|
|
1677
|
+
}
|
|
1433
1678
|
const verbose = options?.verbose ?? false;
|
|
1434
1679
|
const evalIdFilter = options?.evalId;
|
|
1435
|
-
const absoluteTestPath =
|
|
1680
|
+
const absoluteTestPath = path7.resolve(evalFilePath);
|
|
1436
1681
|
const repoRootPath = resolveToAbsolutePath(repoRoot);
|
|
1437
1682
|
const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
|
|
1438
1683
|
const config = await loadConfig(absoluteTestPath, repoRootPath);
|
|
1439
1684
|
const guidelinePatterns = config?.guideline_patterns;
|
|
1440
|
-
const rawFile = await
|
|
1685
|
+
const rawFile = await readFile6(absoluteTestPath, "utf8");
|
|
1441
1686
|
const parsed = parse2(rawFile);
|
|
1442
1687
|
if (!isJsonObject(parsed)) {
|
|
1443
1688
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
1444
1689
|
}
|
|
1445
1690
|
const suite = parsed;
|
|
1446
|
-
const datasetNameFromSuite =
|
|
1447
|
-
const fallbackDataset =
|
|
1691
|
+
const datasetNameFromSuite = asString6(suite.dataset)?.trim();
|
|
1692
|
+
const fallbackDataset = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
1448
1693
|
const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
|
|
1449
1694
|
const rawTestcases = suite.evalcases;
|
|
1450
1695
|
if (!Array.isArray(rawTestcases)) {
|
|
@@ -1452,24 +1697,24 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1452
1697
|
}
|
|
1453
1698
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
|
|
1454
1699
|
const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
1455
|
-
const _globalTarget =
|
|
1700
|
+
const _globalTarget = asString6(globalExecution?.target) ?? asString6(suite.target);
|
|
1456
1701
|
const results = [];
|
|
1457
1702
|
for (const rawEvalcase of rawTestcases) {
|
|
1458
1703
|
if (!isJsonObject(rawEvalcase)) {
|
|
1459
|
-
|
|
1704
|
+
logWarning6("Skipping invalid eval case entry (expected object)");
|
|
1460
1705
|
continue;
|
|
1461
1706
|
}
|
|
1462
1707
|
const evalcase = rawEvalcase;
|
|
1463
|
-
const id =
|
|
1708
|
+
const id = asString6(evalcase.id);
|
|
1464
1709
|
if (evalIdFilter && id !== evalIdFilter) {
|
|
1465
1710
|
continue;
|
|
1466
1711
|
}
|
|
1467
|
-
const conversationId =
|
|
1468
|
-
const outcome =
|
|
1712
|
+
const conversationId = asString6(evalcase.conversation_id);
|
|
1713
|
+
const outcome = asString6(evalcase.expected_outcome) ?? asString6(evalcase.outcome);
|
|
1469
1714
|
const inputMessagesValue = evalcase.input_messages;
|
|
1470
1715
|
const expectedMessagesValue = evalcase.expected_messages;
|
|
1471
1716
|
if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
|
|
1472
|
-
|
|
1717
|
+
logError2(
|
|
1473
1718
|
`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`
|
|
1474
1719
|
);
|
|
1475
1720
|
continue;
|
|
@@ -1480,7 +1725,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1480
1725
|
);
|
|
1481
1726
|
const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
|
|
1482
1727
|
if (hasExpectedMessages && expectedMessages.length === 0) {
|
|
1483
|
-
|
|
1728
|
+
logError2(`No valid expected message found for eval case: ${id}`);
|
|
1484
1729
|
continue;
|
|
1485
1730
|
}
|
|
1486
1731
|
const guidelinePaths = [];
|
|
@@ -1521,7 +1766,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1521
1766
|
evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
|
|
1522
1767
|
} catch (error) {
|
|
1523
1768
|
const message = error instanceof Error ? error.message : String(error);
|
|
1524
|
-
|
|
1769
|
+
logError2(`Skipping eval case '${id}': ${message}`);
|
|
1525
1770
|
continue;
|
|
1526
1771
|
}
|
|
1527
1772
|
const inlineRubrics = evalcase.rubrics;
|
|
@@ -1536,8 +1781,8 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1536
1781
|
};
|
|
1537
1782
|
}
|
|
1538
1783
|
return {
|
|
1539
|
-
id:
|
|
1540
|
-
description:
|
|
1784
|
+
id: asString6(rubric.id) ?? `rubric-${index + 1}`,
|
|
1785
|
+
description: asString6(rubric.description) ?? "",
|
|
1541
1786
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
1542
1787
|
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
1543
1788
|
};
|
|
@@ -1558,7 +1803,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1558
1803
|
}
|
|
1559
1804
|
}
|
|
1560
1805
|
const allFilePaths = [
|
|
1561
|
-
...guidelinePaths.map((guidelinePath) =>
|
|
1806
|
+
...guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
|
|
1562
1807
|
...userFilePaths
|
|
1563
1808
|
];
|
|
1564
1809
|
const testCase = {
|
|
@@ -1570,7 +1815,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1570
1815
|
input_segments: inputSegments,
|
|
1571
1816
|
expected_messages: outputSegments,
|
|
1572
1817
|
reference_answer: referenceAnswer,
|
|
1573
|
-
guideline_paths: guidelinePaths.map((guidelinePath) =>
|
|
1818
|
+
guideline_paths: guidelinePaths.map((guidelinePath) => path7.resolve(guidelinePath)),
|
|
1574
1819
|
guideline_patterns: guidelinePatterns,
|
|
1575
1820
|
file_paths: allFilePaths,
|
|
1576
1821
|
expected_outcome: outcome,
|
|
@@ -1593,25 +1838,25 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
1593
1838
|
}
|
|
1594
1839
|
return results;
|
|
1595
1840
|
}
|
|
1596
|
-
function
|
|
1841
|
+
function asString6(value) {
|
|
1597
1842
|
return typeof value === "string" ? value : void 0;
|
|
1598
1843
|
}
|
|
1599
|
-
function
|
|
1844
|
+
function logWarning6(message, details) {
|
|
1600
1845
|
if (details && details.length > 0) {
|
|
1601
1846
|
const detailBlock = details.join("\n");
|
|
1602
|
-
console.warn(`${
|
|
1603
|
-
${detailBlock}${
|
|
1847
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}
|
|
1848
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1604
1849
|
} else {
|
|
1605
|
-
console.warn(`${
|
|
1850
|
+
console.warn(`${ANSI_YELLOW7}Warning: ${message}${ANSI_RESET7}`);
|
|
1606
1851
|
}
|
|
1607
1852
|
}
|
|
1608
|
-
function
|
|
1853
|
+
function logError2(message, details) {
|
|
1609
1854
|
if (details && details.length > 0) {
|
|
1610
1855
|
const detailBlock = details.join("\n");
|
|
1611
|
-
console.error(`${
|
|
1612
|
-
${detailBlock}${
|
|
1856
|
+
console.error(`${ANSI_RED2}Error: ${message}
|
|
1857
|
+
${detailBlock}${ANSI_RESET7}`);
|
|
1613
1858
|
} else {
|
|
1614
|
-
console.error(`${
|
|
1859
|
+
console.error(`${ANSI_RED2}Error: ${message}${ANSI_RESET7}`);
|
|
1615
1860
|
}
|
|
1616
1861
|
}
|
|
1617
1862
|
|
|
@@ -1954,7 +2199,7 @@ import { randomUUID } from "node:crypto";
|
|
|
1954
2199
|
import { createWriteStream } from "node:fs";
|
|
1955
2200
|
import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
1956
2201
|
import { tmpdir } from "node:os";
|
|
1957
|
-
import
|
|
2202
|
+
import path9 from "node:path";
|
|
1958
2203
|
|
|
1959
2204
|
// src/evaluation/providers/claude-code-log-tracker.ts
|
|
1960
2205
|
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
|
|
@@ -2010,7 +2255,7 @@ function subscribeToClaudeCodeLogEntries(listener) {
|
|
|
2010
2255
|
}
|
|
2011
2256
|
|
|
2012
2257
|
// src/evaluation/providers/preread.ts
|
|
2013
|
-
import
|
|
2258
|
+
import path8 from "node:path";
|
|
2014
2259
|
function buildPromptDocument(request, inputFiles, options) {
|
|
2015
2260
|
const parts = [];
|
|
2016
2261
|
const guidelineFiles = collectGuidelineFiles(
|
|
@@ -2033,7 +2278,7 @@ function normalizeInputFiles(inputFiles) {
|
|
|
2033
2278
|
}
|
|
2034
2279
|
const deduped = /* @__PURE__ */ new Map();
|
|
2035
2280
|
for (const inputFile of inputFiles) {
|
|
2036
|
-
const absolutePath =
|
|
2281
|
+
const absolutePath = path8.resolve(inputFile);
|
|
2037
2282
|
if (!deduped.has(absolutePath)) {
|
|
2038
2283
|
deduped.set(absolutePath, absolutePath);
|
|
2039
2284
|
}
|
|
@@ -2046,14 +2291,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
|
|
|
2046
2291
|
}
|
|
2047
2292
|
const unique = /* @__PURE__ */ new Map();
|
|
2048
2293
|
for (const inputFile of inputFiles) {
|
|
2049
|
-
const absolutePath =
|
|
2294
|
+
const absolutePath = path8.resolve(inputFile);
|
|
2050
2295
|
if (overrides?.has(absolutePath)) {
|
|
2051
2296
|
if (!unique.has(absolutePath)) {
|
|
2052
2297
|
unique.set(absolutePath, absolutePath);
|
|
2053
2298
|
}
|
|
2054
2299
|
continue;
|
|
2055
2300
|
}
|
|
2056
|
-
const normalized = absolutePath.split(
|
|
2301
|
+
const normalized = absolutePath.split(path8.sep).join("/");
|
|
2057
2302
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
2058
2303
|
if (!unique.has(absolutePath)) {
|
|
2059
2304
|
unique.set(absolutePath, absolutePath);
|
|
@@ -2068,7 +2313,7 @@ function collectInputFiles(inputFiles) {
|
|
|
2068
2313
|
}
|
|
2069
2314
|
const unique = /* @__PURE__ */ new Map();
|
|
2070
2315
|
for (const inputFile of inputFiles) {
|
|
2071
|
-
const absolutePath =
|
|
2316
|
+
const absolutePath = path8.resolve(inputFile);
|
|
2072
2317
|
if (!unique.has(absolutePath)) {
|
|
2073
2318
|
unique.set(absolutePath, absolutePath);
|
|
2074
2319
|
}
|
|
@@ -2080,7 +2325,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
|
|
|
2080
2325
|
return "";
|
|
2081
2326
|
}
|
|
2082
2327
|
const buildList = (files) => files.map((absolutePath) => {
|
|
2083
|
-
const fileName =
|
|
2328
|
+
const fileName = path8.basename(absolutePath);
|
|
2084
2329
|
const fileUri = pathToFileUri(absolutePath);
|
|
2085
2330
|
return `* [${fileName}](${fileUri})`;
|
|
2086
2331
|
});
|
|
@@ -2100,7 +2345,7 @@ ${buildList(inputFiles).join("\n")}.`);
|
|
|
2100
2345
|
return sections.join("\n");
|
|
2101
2346
|
}
|
|
2102
2347
|
function pathToFileUri(filePath) {
|
|
2103
|
-
const absolutePath =
|
|
2348
|
+
const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
|
|
2104
2349
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
2105
2350
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
2106
2351
|
return `file:///${normalizedPath}`;
|
|
@@ -2137,7 +2382,7 @@ var ClaudeCodeProvider = class {
|
|
|
2137
2382
|
const workspaceRoot = await this.createWorkspace();
|
|
2138
2383
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
2139
2384
|
try {
|
|
2140
|
-
const promptFile =
|
|
2385
|
+
const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
|
|
2141
2386
|
await writeFile(promptFile, request.question, "utf8");
|
|
2142
2387
|
const args = this.buildClaudeCodeArgs(request.question, inputFiles);
|
|
2143
2388
|
const cwd = this.resolveCwd();
|
|
@@ -2185,7 +2430,7 @@ var ClaudeCodeProvider = class {
|
|
|
2185
2430
|
if (!this.config.cwd) {
|
|
2186
2431
|
return process.cwd();
|
|
2187
2432
|
}
|
|
2188
|
-
return
|
|
2433
|
+
return path9.resolve(this.config.cwd);
|
|
2189
2434
|
}
|
|
2190
2435
|
buildClaudeCodeArgs(prompt, inputFiles) {
|
|
2191
2436
|
const args = [];
|
|
@@ -2242,7 +2487,7 @@ ${filesContext}`;
|
|
|
2242
2487
|
}
|
|
2243
2488
|
}
|
|
2244
2489
|
async createWorkspace() {
|
|
2245
|
-
return await mkdtemp(
|
|
2490
|
+
return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
|
|
2246
2491
|
}
|
|
2247
2492
|
async cleanupWorkspace(workspaceRoot) {
|
|
2248
2493
|
try {
|
|
@@ -2256,9 +2501,9 @@ ${filesContext}`;
|
|
|
2256
2501
|
return void 0;
|
|
2257
2502
|
}
|
|
2258
2503
|
if (this.config.logDir) {
|
|
2259
|
-
return
|
|
2504
|
+
return path9.resolve(this.config.logDir);
|
|
2260
2505
|
}
|
|
2261
|
-
return
|
|
2506
|
+
return path9.join(process.cwd(), ".agentv", "logs", "claude-code");
|
|
2262
2507
|
}
|
|
2263
2508
|
async createStreamLogger(request) {
|
|
2264
2509
|
const logDir = this.resolveLogDirectory();
|
|
@@ -2272,7 +2517,7 @@ ${filesContext}`;
|
|
|
2272
2517
|
console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
|
|
2273
2518
|
return void 0;
|
|
2274
2519
|
}
|
|
2275
|
-
const filePath =
|
|
2520
|
+
const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
|
|
2276
2521
|
try {
|
|
2277
2522
|
const logger = await ClaudeCodeStreamLogger.create({
|
|
2278
2523
|
filePath,
|
|
@@ -2677,10 +2922,10 @@ function escapeShellArg(arg) {
|
|
|
2677
2922
|
}
|
|
2678
2923
|
async function defaultClaudeCodeRunner(options) {
|
|
2679
2924
|
const tempId = randomUUID();
|
|
2680
|
-
const stdoutFile =
|
|
2681
|
-
const stderrFile =
|
|
2682
|
-
const exitFile =
|
|
2683
|
-
const pidFile =
|
|
2925
|
+
const stdoutFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stdout`);
|
|
2926
|
+
const stderrFile = path9.join(tmpdir(), `agentv-cc-${tempId}-stderr`);
|
|
2927
|
+
const exitFile = path9.join(tmpdir(), `agentv-cc-${tempId}-exit`);
|
|
2928
|
+
const pidFile = path9.join(tmpdir(), `agentv-cc-${tempId}-pid`);
|
|
2684
2929
|
try {
|
|
2685
2930
|
return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
|
|
2686
2931
|
} finally {
|
|
@@ -2720,8 +2965,8 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2720
2965
|
let lastStdoutSize = 0;
|
|
2721
2966
|
const readFileIfExists = async (filePath) => {
|
|
2722
2967
|
try {
|
|
2723
|
-
const { readFile:
|
|
2724
|
-
return await
|
|
2968
|
+
const { readFile: readFile8 } = await import("node:fs/promises");
|
|
2969
|
+
return await readFile8(filePath, "utf8");
|
|
2725
2970
|
} catch {
|
|
2726
2971
|
return "";
|
|
2727
2972
|
}
|
|
@@ -2796,7 +3041,7 @@ async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitF
|
|
|
2796
3041
|
import { exec as execWithCallback } from "node:child_process";
|
|
2797
3042
|
import fs from "node:fs/promises";
|
|
2798
3043
|
import os from "node:os";
|
|
2799
|
-
import
|
|
3044
|
+
import path10 from "node:path";
|
|
2800
3045
|
import { promisify } from "node:util";
|
|
2801
3046
|
import { z } from "zod";
|
|
2802
3047
|
var ToolCallSchema = z.object({
|
|
@@ -3253,7 +3498,7 @@ function normalizeInputFiles2(inputFiles) {
|
|
|
3253
3498
|
}
|
|
3254
3499
|
const unique = /* @__PURE__ */ new Map();
|
|
3255
3500
|
for (const inputFile of inputFiles) {
|
|
3256
|
-
const absolutePath =
|
|
3501
|
+
const absolutePath = path10.resolve(inputFile);
|
|
3257
3502
|
if (!unique.has(absolutePath)) {
|
|
3258
3503
|
unique.set(absolutePath, absolutePath);
|
|
3259
3504
|
}
|
|
@@ -3267,7 +3512,7 @@ function formatFileList(files, template) {
|
|
|
3267
3512
|
const formatter = template ?? "{path}";
|
|
3268
3513
|
return files.map((filePath) => {
|
|
3269
3514
|
const escapedPath = shellEscape(filePath);
|
|
3270
|
-
const escapedName = shellEscape(
|
|
3515
|
+
const escapedName = shellEscape(path10.basename(filePath));
|
|
3271
3516
|
return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
|
|
3272
3517
|
}).join(" ");
|
|
3273
3518
|
}
|
|
@@ -3291,7 +3536,7 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
|
3291
3536
|
const safeEvalId = evalCaseId || "unknown";
|
|
3292
3537
|
const timestamp = Date.now();
|
|
3293
3538
|
const random = Math.random().toString(36).substring(2, 9);
|
|
3294
|
-
return
|
|
3539
|
+
return path10.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
3295
3540
|
}
|
|
3296
3541
|
function formatTimeoutSuffix2(timeoutMs) {
|
|
3297
3542
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -3307,7 +3552,7 @@ import { randomUUID as randomUUID2 } from "node:crypto";
|
|
|
3307
3552
|
import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
|
|
3308
3553
|
import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
|
|
3309
3554
|
import { tmpdir as tmpdir2 } from "node:os";
|
|
3310
|
-
import
|
|
3555
|
+
import path11 from "node:path";
|
|
3311
3556
|
import { promisify as promisify2 } from "node:util";
|
|
3312
3557
|
|
|
3313
3558
|
// src/evaluation/providers/codex-log-tracker.ts
|
|
@@ -3402,7 +3647,7 @@ var CodexProvider = class {
|
|
|
3402
3647
|
const promptContent = `${systemPrompt}
|
|
3403
3648
|
|
|
3404
3649
|
${basePrompt}`;
|
|
3405
|
-
const promptFile =
|
|
3650
|
+
const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME2);
|
|
3406
3651
|
await writeFile2(promptFile, promptContent, "utf8");
|
|
3407
3652
|
const args = this.buildCodexArgs();
|
|
3408
3653
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -3452,7 +3697,7 @@ ${basePrompt}`;
|
|
|
3452
3697
|
if (!this.config.cwd) {
|
|
3453
3698
|
return workspaceRoot;
|
|
3454
3699
|
}
|
|
3455
|
-
return
|
|
3700
|
+
return path11.resolve(this.config.cwd);
|
|
3456
3701
|
}
|
|
3457
3702
|
buildCodexArgs() {
|
|
3458
3703
|
const args = [
|
|
@@ -3494,7 +3739,7 @@ ${basePrompt}`;
|
|
|
3494
3739
|
}
|
|
3495
3740
|
}
|
|
3496
3741
|
async createWorkspace() {
|
|
3497
|
-
return await mkdtemp2(
|
|
3742
|
+
return await mkdtemp2(path11.join(tmpdir2(), WORKSPACE_PREFIX2));
|
|
3498
3743
|
}
|
|
3499
3744
|
async cleanupWorkspace(workspaceRoot) {
|
|
3500
3745
|
try {
|
|
@@ -3508,9 +3753,9 @@ ${basePrompt}`;
|
|
|
3508
3753
|
return void 0;
|
|
3509
3754
|
}
|
|
3510
3755
|
if (this.config.logDir) {
|
|
3511
|
-
return
|
|
3756
|
+
return path11.resolve(this.config.logDir);
|
|
3512
3757
|
}
|
|
3513
|
-
return
|
|
3758
|
+
return path11.join(process.cwd(), ".agentv", "logs", "codex");
|
|
3514
3759
|
}
|
|
3515
3760
|
async createStreamLogger(request) {
|
|
3516
3761
|
const logDir = this.resolveLogDirectory();
|
|
@@ -3524,7 +3769,7 @@ ${basePrompt}`;
|
|
|
3524
3769
|
console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
|
|
3525
3770
|
return void 0;
|
|
3526
3771
|
}
|
|
3527
|
-
const filePath =
|
|
3772
|
+
const filePath = path11.join(logDir, buildLogFilename2(request, this.targetName));
|
|
3528
3773
|
try {
|
|
3529
3774
|
const logger = await CodexStreamLogger.create({
|
|
3530
3775
|
filePath,
|
|
@@ -3739,7 +3984,7 @@ function tryParseJsonValue2(rawLine) {
|
|
|
3739
3984
|
async function locateExecutable(candidate) {
|
|
3740
3985
|
const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
|
|
3741
3986
|
if (includesPathSeparator) {
|
|
3742
|
-
const resolved =
|
|
3987
|
+
const resolved = path11.isAbsolute(candidate) ? candidate : path11.resolve(candidate);
|
|
3743
3988
|
const executablePath = await ensureWindowsExecutableVariant(resolved);
|
|
3744
3989
|
await access2(executablePath, constants2.F_OK);
|
|
3745
3990
|
return executablePath;
|
|
@@ -4252,7 +4497,7 @@ import { randomUUID as randomUUID3 } from "node:crypto";
|
|
|
4252
4497
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
4253
4498
|
import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
|
|
4254
4499
|
import { tmpdir as tmpdir3 } from "node:os";
|
|
4255
|
-
import
|
|
4500
|
+
import path12 from "node:path";
|
|
4256
4501
|
|
|
4257
4502
|
// src/evaluation/providers/pi-log-tracker.ts
|
|
4258
4503
|
var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
|
|
@@ -4336,7 +4581,7 @@ var PiCodingAgentProvider = class {
|
|
|
4336
4581
|
const workspaceRoot = await this.createWorkspace();
|
|
4337
4582
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
4338
4583
|
try {
|
|
4339
|
-
const promptFile =
|
|
4584
|
+
const promptFile = path12.join(workspaceRoot, PROMPT_FILENAME3);
|
|
4340
4585
|
await writeFile3(promptFile, request.question, "utf8");
|
|
4341
4586
|
const args = this.buildPiArgs(request.question, inputFiles);
|
|
4342
4587
|
const cwd = this.resolveCwd(workspaceRoot);
|
|
@@ -4378,7 +4623,7 @@ var PiCodingAgentProvider = class {
|
|
|
4378
4623
|
if (!this.config.cwd) {
|
|
4379
4624
|
return workspaceRoot;
|
|
4380
4625
|
}
|
|
4381
|
-
return
|
|
4626
|
+
return path12.resolve(this.config.cwd);
|
|
4382
4627
|
}
|
|
4383
4628
|
buildPiArgs(prompt, inputFiles) {
|
|
4384
4629
|
const args = [];
|
|
@@ -4467,7 +4712,7 @@ ${prompt}`;
|
|
|
4467
4712
|
return env;
|
|
4468
4713
|
}
|
|
4469
4714
|
async createWorkspace() {
|
|
4470
|
-
return await mkdtemp3(
|
|
4715
|
+
return await mkdtemp3(path12.join(tmpdir3(), WORKSPACE_PREFIX3));
|
|
4471
4716
|
}
|
|
4472
4717
|
async cleanupWorkspace(workspaceRoot) {
|
|
4473
4718
|
try {
|
|
@@ -4477,9 +4722,9 @@ ${prompt}`;
|
|
|
4477
4722
|
}
|
|
4478
4723
|
resolveLogDirectory() {
|
|
4479
4724
|
if (this.config.logDir) {
|
|
4480
|
-
return
|
|
4725
|
+
return path12.resolve(this.config.logDir);
|
|
4481
4726
|
}
|
|
4482
|
-
return
|
|
4727
|
+
return path12.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
4483
4728
|
}
|
|
4484
4729
|
async createStreamLogger(request) {
|
|
4485
4730
|
const logDir = this.resolveLogDirectory();
|
|
@@ -4493,7 +4738,7 @@ ${prompt}`;
|
|
|
4493
4738
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
4494
4739
|
return void 0;
|
|
4495
4740
|
}
|
|
4496
|
-
const filePath =
|
|
4741
|
+
const filePath = path12.join(logDir, buildLogFilename3(request, this.targetName));
|
|
4497
4742
|
try {
|
|
4498
4743
|
const logger = await PiStreamLogger.create({
|
|
4499
4744
|
filePath,
|
|
@@ -4926,7 +5171,7 @@ async function defaultPiRunner(options) {
|
|
|
4926
5171
|
}
|
|
4927
5172
|
|
|
4928
5173
|
// src/evaluation/providers/vscode.ts
|
|
4929
|
-
import
|
|
5174
|
+
import path13 from "node:path";
|
|
4930
5175
|
import {
|
|
4931
5176
|
dispatchAgentSession,
|
|
4932
5177
|
dispatchBatchAgent,
|
|
@@ -5101,7 +5346,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
5101
5346
|
return "";
|
|
5102
5347
|
}
|
|
5103
5348
|
const buildList = (files) => files.map((absolutePath) => {
|
|
5104
|
-
const fileName =
|
|
5349
|
+
const fileName = path13.basename(absolutePath);
|
|
5105
5350
|
const fileUri = pathToFileUri2(absolutePath);
|
|
5106
5351
|
return `* [${fileName}](${fileUri})`;
|
|
5107
5352
|
});
|
|
@@ -5126,8 +5371,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
5126
5371
|
}
|
|
5127
5372
|
const unique = /* @__PURE__ */ new Map();
|
|
5128
5373
|
for (const attachment of attachments) {
|
|
5129
|
-
const absolutePath =
|
|
5130
|
-
const normalized = absolutePath.split(
|
|
5374
|
+
const absolutePath = path13.resolve(attachment);
|
|
5375
|
+
const normalized = absolutePath.split(path13.sep).join("/");
|
|
5131
5376
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
5132
5377
|
if (!unique.has(absolutePath)) {
|
|
5133
5378
|
unique.set(absolutePath, absolutePath);
|
|
@@ -5142,7 +5387,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
5142
5387
|
}
|
|
5143
5388
|
const unique = /* @__PURE__ */ new Map();
|
|
5144
5389
|
for (const attachment of attachments) {
|
|
5145
|
-
const absolutePath =
|
|
5390
|
+
const absolutePath = path13.resolve(attachment);
|
|
5146
5391
|
if (!unique.has(absolutePath)) {
|
|
5147
5392
|
unique.set(absolutePath, absolutePath);
|
|
5148
5393
|
}
|
|
@@ -5150,7 +5395,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
5150
5395
|
return Array.from(unique.values());
|
|
5151
5396
|
}
|
|
5152
5397
|
function pathToFileUri2(filePath) {
|
|
5153
|
-
const absolutePath =
|
|
5398
|
+
const absolutePath = path13.isAbsolute(filePath) ? filePath : path13.resolve(filePath);
|
|
5154
5399
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
5155
5400
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
5156
5401
|
return `file:///${normalizedPath}`;
|
|
@@ -5163,7 +5408,7 @@ function normalizeAttachments(attachments) {
|
|
|
5163
5408
|
}
|
|
5164
5409
|
const deduped = /* @__PURE__ */ new Set();
|
|
5165
5410
|
for (const attachment of attachments) {
|
|
5166
|
-
deduped.add(
|
|
5411
|
+
deduped.add(path13.resolve(attachment));
|
|
5167
5412
|
}
|
|
5168
5413
|
return Array.from(deduped);
|
|
5169
5414
|
}
|
|
@@ -5172,7 +5417,7 @@ function mergeAttachments(all) {
|
|
|
5172
5417
|
for (const list of all) {
|
|
5173
5418
|
if (!list) continue;
|
|
5174
5419
|
for (const inputFile of list) {
|
|
5175
|
-
deduped.add(
|
|
5420
|
+
deduped.add(path13.resolve(inputFile));
|
|
5176
5421
|
}
|
|
5177
5422
|
}
|
|
5178
5423
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -5220,8 +5465,8 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
5220
5465
|
|
|
5221
5466
|
// src/evaluation/providers/targets-file.ts
|
|
5222
5467
|
import { constants as constants3 } from "node:fs";
|
|
5223
|
-
import { access as access3, readFile as
|
|
5224
|
-
import
|
|
5468
|
+
import { access as access3, readFile as readFile7 } from "node:fs/promises";
|
|
5469
|
+
import path14 from "node:path";
|
|
5225
5470
|
import { parse as parse3 } from "yaml";
|
|
5226
5471
|
function isRecord(value) {
|
|
5227
5472
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
@@ -5258,11 +5503,11 @@ async function fileExists3(filePath) {
|
|
|
5258
5503
|
}
|
|
5259
5504
|
}
|
|
5260
5505
|
async function readTargetDefinitions(filePath) {
|
|
5261
|
-
const absolutePath =
|
|
5506
|
+
const absolutePath = path14.resolve(filePath);
|
|
5262
5507
|
if (!await fileExists3(absolutePath)) {
|
|
5263
5508
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
5264
5509
|
}
|
|
5265
|
-
const raw = await
|
|
5510
|
+
const raw = await readFile7(absolutePath, "utf8");
|
|
5266
5511
|
const parsed = parse3(raw);
|
|
5267
5512
|
if (!isRecord(parsed)) {
|
|
5268
5513
|
throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
|
|
@@ -5469,15 +5714,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
5469
5714
|
});
|
|
5470
5715
|
}
|
|
5471
5716
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
5472
|
-
const { mkdir: mkdir4, readFile:
|
|
5717
|
+
const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
|
|
5473
5718
|
const { tmpdir: tmpdir4 } = await import("node:os");
|
|
5474
|
-
const
|
|
5719
|
+
const path16 = await import("node:path");
|
|
5475
5720
|
const { randomUUID: randomUUID4 } = await import("node:crypto");
|
|
5476
|
-
const dir =
|
|
5721
|
+
const dir = path16.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
|
|
5477
5722
|
await mkdir4(dir, { recursive: true });
|
|
5478
|
-
const stdinPath =
|
|
5479
|
-
const stdoutPath =
|
|
5480
|
-
const stderrPath =
|
|
5723
|
+
const stdinPath = path16.join(dir, "stdin.txt");
|
|
5724
|
+
const stdoutPath = path16.join(dir, "stdout.txt");
|
|
5725
|
+
const stderrPath = path16.join(dir, "stderr.txt");
|
|
5481
5726
|
await writeFile4(stdinPath, stdinPayload, "utf8");
|
|
5482
5727
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
5483
5728
|
const { spawn: spawn4 } = await import("node:child_process");
|
|
@@ -5507,8 +5752,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
5507
5752
|
resolve(code ?? 0);
|
|
5508
5753
|
});
|
|
5509
5754
|
});
|
|
5510
|
-
const stdout = (await
|
|
5511
|
-
const stderr = (await
|
|
5755
|
+
const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5756
|
+
const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
5512
5757
|
return { stdout, stderr, exitCode };
|
|
5513
5758
|
} finally {
|
|
5514
5759
|
await rm4(dir, { recursive: true, force: true });
|
|
@@ -5780,7 +6025,7 @@ var CodeEvaluator = class {
|
|
|
5780
6025
|
outputMessages: context.outputMessages ?? null,
|
|
5781
6026
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
5782
6027
|
inputFiles: context.evalCase.file_paths.filter(
|
|
5783
|
-
(
|
|
6028
|
+
(path16) => !context.evalCase.guideline_paths.includes(path16)
|
|
5784
6029
|
),
|
|
5785
6030
|
inputMessages: context.evalCase.input_messages,
|
|
5786
6031
|
traceSummary: context.traceSummary ?? null,
|
|
@@ -6539,115 +6784,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
6539
6784
|
* Evaluate a single field against the expected value.
|
|
6540
6785
|
*/
|
|
6541
6786
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
6542
|
-
const { path:
|
|
6543
|
-
const candidateValue = resolvePath(candidateData,
|
|
6544
|
-
const expectedValue = resolvePath(expectedData,
|
|
6787
|
+
const { path: path16, match, required = true, weight = 1 } = fieldConfig;
|
|
6788
|
+
const candidateValue = resolvePath(candidateData, path16);
|
|
6789
|
+
const expectedValue = resolvePath(expectedData, path16);
|
|
6545
6790
|
if (expectedValue === void 0) {
|
|
6546
6791
|
return {
|
|
6547
|
-
path:
|
|
6792
|
+
path: path16,
|
|
6548
6793
|
score: 1,
|
|
6549
6794
|
// No expected value means no comparison needed
|
|
6550
6795
|
weight,
|
|
6551
6796
|
hit: true,
|
|
6552
|
-
message: `${
|
|
6797
|
+
message: `${path16}: no expected value`
|
|
6553
6798
|
};
|
|
6554
6799
|
}
|
|
6555
6800
|
if (candidateValue === void 0) {
|
|
6556
6801
|
if (required) {
|
|
6557
6802
|
return {
|
|
6558
|
-
path:
|
|
6803
|
+
path: path16,
|
|
6559
6804
|
score: 0,
|
|
6560
6805
|
weight,
|
|
6561
6806
|
hit: false,
|
|
6562
|
-
message: `${
|
|
6807
|
+
message: `${path16} (required, missing)`
|
|
6563
6808
|
};
|
|
6564
6809
|
}
|
|
6565
6810
|
return {
|
|
6566
|
-
path:
|
|
6811
|
+
path: path16,
|
|
6567
6812
|
score: 1,
|
|
6568
6813
|
// Don't penalize missing optional fields
|
|
6569
6814
|
weight: 0,
|
|
6570
6815
|
// Zero weight means it won't affect the score
|
|
6571
6816
|
hit: true,
|
|
6572
|
-
message: `${
|
|
6817
|
+
message: `${path16}: optional field missing`
|
|
6573
6818
|
};
|
|
6574
6819
|
}
|
|
6575
6820
|
switch (match) {
|
|
6576
6821
|
case "exact":
|
|
6577
|
-
return this.compareExact(
|
|
6822
|
+
return this.compareExact(path16, candidateValue, expectedValue, weight);
|
|
6578
6823
|
case "numeric_tolerance":
|
|
6579
6824
|
return this.compareNumericTolerance(
|
|
6580
|
-
|
|
6825
|
+
path16,
|
|
6581
6826
|
candidateValue,
|
|
6582
6827
|
expectedValue,
|
|
6583
6828
|
fieldConfig,
|
|
6584
6829
|
weight
|
|
6585
6830
|
);
|
|
6586
6831
|
case "date":
|
|
6587
|
-
return this.compareDate(
|
|
6832
|
+
return this.compareDate(path16, candidateValue, expectedValue, fieldConfig, weight);
|
|
6588
6833
|
default:
|
|
6589
6834
|
return {
|
|
6590
|
-
path:
|
|
6835
|
+
path: path16,
|
|
6591
6836
|
score: 0,
|
|
6592
6837
|
weight,
|
|
6593
6838
|
hit: false,
|
|
6594
|
-
message: `${
|
|
6839
|
+
message: `${path16}: unknown match type "${match}"`
|
|
6595
6840
|
};
|
|
6596
6841
|
}
|
|
6597
6842
|
}
|
|
6598
6843
|
/**
|
|
6599
6844
|
* Exact equality comparison.
|
|
6600
6845
|
*/
|
|
6601
|
-
compareExact(
|
|
6846
|
+
compareExact(path16, candidateValue, expectedValue, weight) {
|
|
6602
6847
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
6603
6848
|
return {
|
|
6604
|
-
path:
|
|
6849
|
+
path: path16,
|
|
6605
6850
|
score: 1,
|
|
6606
6851
|
weight,
|
|
6607
6852
|
hit: true,
|
|
6608
|
-
message:
|
|
6853
|
+
message: path16
|
|
6609
6854
|
};
|
|
6610
6855
|
}
|
|
6611
6856
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
6612
6857
|
return {
|
|
6613
|
-
path:
|
|
6858
|
+
path: path16,
|
|
6614
6859
|
score: 0,
|
|
6615
6860
|
weight,
|
|
6616
6861
|
hit: false,
|
|
6617
|
-
message: `${
|
|
6862
|
+
message: `${path16} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
6618
6863
|
};
|
|
6619
6864
|
}
|
|
6620
6865
|
return {
|
|
6621
|
-
path:
|
|
6866
|
+
path: path16,
|
|
6622
6867
|
score: 0,
|
|
6623
6868
|
weight,
|
|
6624
6869
|
hit: false,
|
|
6625
|
-
message: `${
|
|
6870
|
+
message: `${path16} (value mismatch)`
|
|
6626
6871
|
};
|
|
6627
6872
|
}
|
|
6628
6873
|
/**
|
|
6629
6874
|
* Numeric comparison with absolute or relative tolerance.
|
|
6630
6875
|
*/
|
|
6631
|
-
compareNumericTolerance(
|
|
6876
|
+
compareNumericTolerance(path16, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6632
6877
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
6633
6878
|
const candidateNum = toNumber(candidateValue);
|
|
6634
6879
|
const expectedNum = toNumber(expectedValue);
|
|
6635
6880
|
if (candidateNum === null || expectedNum === null) {
|
|
6636
6881
|
return {
|
|
6637
|
-
path:
|
|
6882
|
+
path: path16,
|
|
6638
6883
|
score: 0,
|
|
6639
6884
|
weight,
|
|
6640
6885
|
hit: false,
|
|
6641
|
-
message: `${
|
|
6886
|
+
message: `${path16} (non-numeric value)`
|
|
6642
6887
|
};
|
|
6643
6888
|
}
|
|
6644
6889
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
6645
6890
|
return {
|
|
6646
|
-
path:
|
|
6891
|
+
path: path16,
|
|
6647
6892
|
score: 0,
|
|
6648
6893
|
weight,
|
|
6649
6894
|
hit: false,
|
|
6650
|
-
message: `${
|
|
6895
|
+
message: `${path16} (invalid numeric value)`
|
|
6651
6896
|
};
|
|
6652
6897
|
}
|
|
6653
6898
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -6660,61 +6905,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
6660
6905
|
}
|
|
6661
6906
|
if (withinTolerance) {
|
|
6662
6907
|
return {
|
|
6663
|
-
path:
|
|
6908
|
+
path: path16,
|
|
6664
6909
|
score: 1,
|
|
6665
6910
|
weight,
|
|
6666
6911
|
hit: true,
|
|
6667
|
-
message: `${
|
|
6912
|
+
message: `${path16} (within tolerance: diff=${diff.toFixed(2)})`
|
|
6668
6913
|
};
|
|
6669
6914
|
}
|
|
6670
6915
|
return {
|
|
6671
|
-
path:
|
|
6916
|
+
path: path16,
|
|
6672
6917
|
score: 0,
|
|
6673
6918
|
weight,
|
|
6674
6919
|
hit: false,
|
|
6675
|
-
message: `${
|
|
6920
|
+
message: `${path16} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
6676
6921
|
};
|
|
6677
6922
|
}
|
|
6678
6923
|
/**
|
|
6679
6924
|
* Date comparison with format normalization.
|
|
6680
6925
|
*/
|
|
6681
|
-
compareDate(
|
|
6926
|
+
compareDate(path16, candidateValue, expectedValue, fieldConfig, weight) {
|
|
6682
6927
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
6683
6928
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
6684
6929
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
6685
6930
|
if (candidateDate === null) {
|
|
6686
6931
|
return {
|
|
6687
|
-
path:
|
|
6932
|
+
path: path16,
|
|
6688
6933
|
score: 0,
|
|
6689
6934
|
weight,
|
|
6690
6935
|
hit: false,
|
|
6691
|
-
message: `${
|
|
6936
|
+
message: `${path16} (unparseable candidate date)`
|
|
6692
6937
|
};
|
|
6693
6938
|
}
|
|
6694
6939
|
if (expectedDate === null) {
|
|
6695
6940
|
return {
|
|
6696
|
-
path:
|
|
6941
|
+
path: path16,
|
|
6697
6942
|
score: 0,
|
|
6698
6943
|
weight,
|
|
6699
6944
|
hit: false,
|
|
6700
|
-
message: `${
|
|
6945
|
+
message: `${path16} (unparseable expected date)`
|
|
6701
6946
|
};
|
|
6702
6947
|
}
|
|
6703
6948
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
6704
6949
|
return {
|
|
6705
|
-
path:
|
|
6950
|
+
path: path16,
|
|
6706
6951
|
score: 1,
|
|
6707
6952
|
weight,
|
|
6708
6953
|
hit: true,
|
|
6709
|
-
message:
|
|
6954
|
+
message: path16
|
|
6710
6955
|
};
|
|
6711
6956
|
}
|
|
6712
6957
|
return {
|
|
6713
|
-
path:
|
|
6958
|
+
path: path16,
|
|
6714
6959
|
score: 0,
|
|
6715
6960
|
weight,
|
|
6716
6961
|
hit: false,
|
|
6717
|
-
message: `${
|
|
6962
|
+
message: `${path16} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
6718
6963
|
};
|
|
6719
6964
|
}
|
|
6720
6965
|
/**
|
|
@@ -6754,11 +6999,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
6754
6999
|
};
|
|
6755
7000
|
}
|
|
6756
7001
|
};
|
|
6757
|
-
function resolvePath(obj,
|
|
6758
|
-
if (!
|
|
7002
|
+
function resolvePath(obj, path16) {
|
|
7003
|
+
if (!path16 || !obj) {
|
|
6759
7004
|
return void 0;
|
|
6760
7005
|
}
|
|
6761
|
-
const parts =
|
|
7006
|
+
const parts = path16.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
6762
7007
|
let current = obj;
|
|
6763
7008
|
for (const part of parts) {
|
|
6764
7009
|
if (current === null || current === void 0) {
|
|
@@ -7194,7 +7439,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
7194
7439
|
|
|
7195
7440
|
// src/evaluation/orchestrator.ts
|
|
7196
7441
|
import { createHash } from "node:crypto";
|
|
7197
|
-
import
|
|
7442
|
+
import path15 from "node:path";
|
|
7198
7443
|
|
|
7199
7444
|
// ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
|
|
7200
7445
|
var Node = class {
|
|
@@ -7993,7 +8238,7 @@ async function runEvaluatorList(options) {
|
|
|
7993
8238
|
});
|
|
7994
8239
|
}
|
|
7995
8240
|
if (evaluator.type === "composite") {
|
|
7996
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
8241
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path15.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
7997
8242
|
const createEvaluator = (memberConfig) => {
|
|
7998
8243
|
switch (memberConfig.type) {
|
|
7999
8244
|
case "llm_judge":
|
|
@@ -8567,6 +8812,7 @@ export {
|
|
|
8567
8812
|
createAgentKernel,
|
|
8568
8813
|
createProvider,
|
|
8569
8814
|
deepEqual,
|
|
8815
|
+
detectFormat,
|
|
8570
8816
|
ensureVSCodeSubagents,
|
|
8571
8817
|
executeScript,
|
|
8572
8818
|
explorationRatio,
|