@agentv/core 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-ZK4GG7PR.js → chunk-75RFVESM.js} +215 -127
- package/dist/chunk-75RFVESM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +110 -95
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +30 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1271 -465
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +288 -74
- package/dist/index.d.ts +288 -74
- package/dist/index.js +1024 -311
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-ZK4GG7PR.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
COMMON_TARGET_SETTINGS,
|
|
3
|
+
LLM_GRADER_CAPABLE_KINDS,
|
|
3
4
|
TEST_MESSAGE_ROLES,
|
|
4
5
|
buildDirectoryChain,
|
|
5
6
|
buildSearchRoots,
|
|
@@ -24,7 +25,7 @@ import {
|
|
|
24
25
|
resolveDelegatedTargetDefinition,
|
|
25
26
|
resolveFileReference,
|
|
26
27
|
resolveTargetDefinition
|
|
27
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-75RFVESM.js";
|
|
28
29
|
import {
|
|
29
30
|
AgentvProvider
|
|
30
31
|
} from "./chunk-PRNXHNLF.js";
|
|
@@ -856,8 +857,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
856
857
|
const negate = rawEvaluator.negate === true ? true : void 0;
|
|
857
858
|
if (isCustomType) {
|
|
858
859
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
859
|
-
const required2 =
|
|
860
|
-
|
|
860
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
861
|
+
rawEvaluator.required,
|
|
862
|
+
rawEvaluator.min_score,
|
|
863
|
+
name,
|
|
864
|
+
evalId
|
|
865
|
+
);
|
|
866
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
|
|
861
867
|
const config2 = {};
|
|
862
868
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
863
869
|
if (!knownProps2.has(key) && value !== void 0) {
|
|
@@ -869,6 +875,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
869
875
|
type: customTypeName,
|
|
870
876
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
871
877
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
878
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
872
879
|
...negate !== void 0 ? { negate } : {},
|
|
873
880
|
...Object.keys(config2).length > 0 ? { config: config2 } : {}
|
|
874
881
|
});
|
|
@@ -938,7 +945,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
938
945
|
);
|
|
939
946
|
}
|
|
940
947
|
}
|
|
941
|
-
const required2 =
|
|
948
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
949
|
+
rawEvaluator.required,
|
|
950
|
+
rawEvaluator.min_score,
|
|
951
|
+
name,
|
|
952
|
+
evalId
|
|
953
|
+
);
|
|
942
954
|
const knownProps2 = /* @__PURE__ */ new Set([
|
|
943
955
|
"name",
|
|
944
956
|
"type",
|
|
@@ -964,6 +976,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
964
976
|
resolvedCwd,
|
|
965
977
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
966
978
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
979
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
967
980
|
...negate !== void 0 ? { negate } : {},
|
|
968
981
|
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
969
982
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
@@ -1092,7 +1105,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1092
1105
|
};
|
|
1093
1106
|
}
|
|
1094
1107
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1095
|
-
const required2 =
|
|
1108
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1109
|
+
rawEvaluator.required,
|
|
1110
|
+
rawEvaluator.min_score,
|
|
1111
|
+
name,
|
|
1112
|
+
evalId
|
|
1113
|
+
);
|
|
1096
1114
|
evaluators.push({
|
|
1097
1115
|
name,
|
|
1098
1116
|
type: "composite",
|
|
@@ -1100,6 +1118,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1100
1118
|
aggregator,
|
|
1101
1119
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1102
1120
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1121
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1103
1122
|
...negate !== void 0 ? { negate } : {}
|
|
1104
1123
|
});
|
|
1105
1124
|
continue;
|
|
@@ -1210,7 +1229,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1210
1229
|
continue;
|
|
1211
1230
|
}
|
|
1212
1231
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1213
|
-
const required2 =
|
|
1232
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1233
|
+
rawEvaluator.required,
|
|
1234
|
+
rawEvaluator.min_score,
|
|
1235
|
+
name,
|
|
1236
|
+
evalId
|
|
1237
|
+
);
|
|
1214
1238
|
const config2 = {
|
|
1215
1239
|
name,
|
|
1216
1240
|
type: "tool-trajectory",
|
|
@@ -1219,6 +1243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1219
1243
|
...expected ? { expected } : {},
|
|
1220
1244
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1221
1245
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1246
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1222
1247
|
...negate !== void 0 ? { negate } : {},
|
|
1223
1248
|
...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
|
|
1224
1249
|
};
|
|
@@ -1281,7 +1306,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1281
1306
|
const aggregation = asString(rawEvaluator.aggregation);
|
|
1282
1307
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
1283
1308
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1284
|
-
const required2 =
|
|
1309
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1310
|
+
rawEvaluator.required,
|
|
1311
|
+
rawEvaluator.min_score,
|
|
1312
|
+
name,
|
|
1313
|
+
evalId
|
|
1314
|
+
);
|
|
1285
1315
|
evaluators.push({
|
|
1286
1316
|
name,
|
|
1287
1317
|
type: "field-accuracy",
|
|
@@ -1289,6 +1319,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1289
1319
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
1290
1320
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1291
1321
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1322
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1292
1323
|
...negate !== void 0 ? { negate } : {}
|
|
1293
1324
|
});
|
|
1294
1325
|
continue;
|
|
@@ -1302,13 +1333,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1302
1333
|
continue;
|
|
1303
1334
|
}
|
|
1304
1335
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1305
|
-
const required2 =
|
|
1336
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1337
|
+
rawEvaluator.required,
|
|
1338
|
+
rawEvaluator.min_score,
|
|
1339
|
+
name,
|
|
1340
|
+
evalId
|
|
1341
|
+
);
|
|
1306
1342
|
evaluators.push({
|
|
1307
1343
|
name,
|
|
1308
1344
|
type: "latency",
|
|
1309
1345
|
threshold,
|
|
1310
1346
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1311
1347
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1348
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1312
1349
|
...negate !== void 0 ? { negate } : {}
|
|
1313
1350
|
});
|
|
1314
1351
|
continue;
|
|
@@ -1322,13 +1359,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1322
1359
|
continue;
|
|
1323
1360
|
}
|
|
1324
1361
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1325
|
-
const required2 =
|
|
1362
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1363
|
+
rawEvaluator.required,
|
|
1364
|
+
rawEvaluator.min_score,
|
|
1365
|
+
name,
|
|
1366
|
+
evalId
|
|
1367
|
+
);
|
|
1326
1368
|
evaluators.push({
|
|
1327
1369
|
name,
|
|
1328
1370
|
type: "cost",
|
|
1329
1371
|
budget,
|
|
1330
1372
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1331
1373
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1374
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1332
1375
|
...negate !== void 0 ? { negate } : {}
|
|
1333
1376
|
});
|
|
1334
1377
|
continue;
|
|
@@ -1360,13 +1403,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1360
1403
|
continue;
|
|
1361
1404
|
}
|
|
1362
1405
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1363
|
-
const required2 =
|
|
1406
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1407
|
+
rawEvaluator.required,
|
|
1408
|
+
rawEvaluator.min_score,
|
|
1409
|
+
name,
|
|
1410
|
+
evalId
|
|
1411
|
+
);
|
|
1364
1412
|
evaluators.push({
|
|
1365
1413
|
name,
|
|
1366
1414
|
type: "token-usage",
|
|
1367
1415
|
...validLimits,
|
|
1368
1416
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1369
1417
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1418
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1370
1419
|
...negate !== void 0 ? { negate } : {}
|
|
1371
1420
|
});
|
|
1372
1421
|
continue;
|
|
@@ -1412,13 +1461,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1412
1461
|
continue;
|
|
1413
1462
|
}
|
|
1414
1463
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1415
|
-
const required2 =
|
|
1464
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1465
|
+
rawEvaluator.required,
|
|
1466
|
+
rawEvaluator.min_score,
|
|
1467
|
+
name,
|
|
1468
|
+
evalId
|
|
1469
|
+
);
|
|
1416
1470
|
evaluators.push({
|
|
1417
1471
|
name,
|
|
1418
1472
|
type: "execution-metrics",
|
|
1419
1473
|
...validThresholds,
|
|
1420
1474
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1421
1475
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1476
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1422
1477
|
...negate !== void 0 ? { negate } : {}
|
|
1423
1478
|
});
|
|
1424
1479
|
continue;
|
|
@@ -1432,7 +1487,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1432
1487
|
const rawShouldTrigger = rawEvaluator.should_trigger;
|
|
1433
1488
|
const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
|
|
1434
1489
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1435
|
-
const required2 =
|
|
1490
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1491
|
+
rawEvaluator.required,
|
|
1492
|
+
rawEvaluator.min_score,
|
|
1493
|
+
name,
|
|
1494
|
+
evalId
|
|
1495
|
+
);
|
|
1436
1496
|
evaluators.push({
|
|
1437
1497
|
name,
|
|
1438
1498
|
type: "skill-trigger",
|
|
@@ -1440,6 +1500,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1440
1500
|
...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
|
|
1441
1501
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1442
1502
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1503
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1443
1504
|
...negate !== void 0 ? { negate } : {}
|
|
1444
1505
|
});
|
|
1445
1506
|
continue;
|
|
@@ -1451,13 +1512,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1451
1512
|
continue;
|
|
1452
1513
|
}
|
|
1453
1514
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1454
|
-
const required2 =
|
|
1515
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1516
|
+
rawEvaluator.required,
|
|
1517
|
+
rawEvaluator.min_score,
|
|
1518
|
+
name,
|
|
1519
|
+
evalId
|
|
1520
|
+
);
|
|
1455
1521
|
evaluators.push({
|
|
1456
1522
|
name,
|
|
1457
1523
|
type: "contains",
|
|
1458
1524
|
value,
|
|
1459
1525
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1460
1526
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1527
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1461
1528
|
...negate !== void 0 ? { negate } : {}
|
|
1462
1529
|
});
|
|
1463
1530
|
continue;
|
|
@@ -1471,13 +1538,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1471
1538
|
continue;
|
|
1472
1539
|
}
|
|
1473
1540
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1474
|
-
const required2 =
|
|
1541
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1542
|
+
rawEvaluator.required,
|
|
1543
|
+
rawEvaluator.min_score,
|
|
1544
|
+
name,
|
|
1545
|
+
evalId
|
|
1546
|
+
);
|
|
1475
1547
|
evaluators.push({
|
|
1476
1548
|
name,
|
|
1477
1549
|
type: typeValue,
|
|
1478
1550
|
value,
|
|
1479
1551
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1480
1552
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1553
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1481
1554
|
...negate !== void 0 ? { negate } : {}
|
|
1482
1555
|
});
|
|
1483
1556
|
continue;
|
|
@@ -1489,13 +1562,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1489
1562
|
continue;
|
|
1490
1563
|
}
|
|
1491
1564
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1492
|
-
const required2 =
|
|
1565
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1566
|
+
rawEvaluator.required,
|
|
1567
|
+
rawEvaluator.min_score,
|
|
1568
|
+
name,
|
|
1569
|
+
evalId
|
|
1570
|
+
);
|
|
1493
1571
|
evaluators.push({
|
|
1494
1572
|
name,
|
|
1495
1573
|
type: "icontains",
|
|
1496
1574
|
value,
|
|
1497
1575
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1498
1576
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1577
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1499
1578
|
...negate !== void 0 ? { negate } : {}
|
|
1500
1579
|
});
|
|
1501
1580
|
continue;
|
|
@@ -1509,13 +1588,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1509
1588
|
continue;
|
|
1510
1589
|
}
|
|
1511
1590
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1512
|
-
const required2 =
|
|
1591
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1592
|
+
rawEvaluator.required,
|
|
1593
|
+
rawEvaluator.min_score,
|
|
1594
|
+
name,
|
|
1595
|
+
evalId
|
|
1596
|
+
);
|
|
1513
1597
|
evaluators.push({
|
|
1514
1598
|
name,
|
|
1515
1599
|
type: typeValue,
|
|
1516
1600
|
value,
|
|
1517
1601
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1518
1602
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1603
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1519
1604
|
...negate !== void 0 ? { negate } : {}
|
|
1520
1605
|
});
|
|
1521
1606
|
continue;
|
|
@@ -1527,13 +1612,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1527
1612
|
continue;
|
|
1528
1613
|
}
|
|
1529
1614
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1530
|
-
const required2 =
|
|
1615
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1616
|
+
rawEvaluator.required,
|
|
1617
|
+
rawEvaluator.min_score,
|
|
1618
|
+
name,
|
|
1619
|
+
evalId
|
|
1620
|
+
);
|
|
1531
1621
|
evaluators.push({
|
|
1532
1622
|
name,
|
|
1533
1623
|
type: typeValue,
|
|
1534
1624
|
value,
|
|
1535
1625
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1536
1626
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1627
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1537
1628
|
...negate !== void 0 ? { negate } : {}
|
|
1538
1629
|
});
|
|
1539
1630
|
continue;
|
|
@@ -1546,7 +1637,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1546
1637
|
}
|
|
1547
1638
|
const flags = asString(rawEvaluator.flags);
|
|
1548
1639
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1549
|
-
const required2 =
|
|
1640
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1641
|
+
rawEvaluator.required,
|
|
1642
|
+
rawEvaluator.min_score,
|
|
1643
|
+
name,
|
|
1644
|
+
evalId
|
|
1645
|
+
);
|
|
1550
1646
|
evaluators.push({
|
|
1551
1647
|
name,
|
|
1552
1648
|
type: "regex",
|
|
@@ -1554,18 +1650,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1554
1650
|
...flags !== void 0 ? { flags } : {},
|
|
1555
1651
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1556
1652
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1653
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1557
1654
|
...negate !== void 0 ? { negate } : {}
|
|
1558
1655
|
});
|
|
1559
1656
|
continue;
|
|
1560
1657
|
}
|
|
1561
1658
|
if (typeValue === "is-json") {
|
|
1562
1659
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1563
|
-
const required2 =
|
|
1660
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1661
|
+
rawEvaluator.required,
|
|
1662
|
+
rawEvaluator.min_score,
|
|
1663
|
+
name,
|
|
1664
|
+
evalId
|
|
1665
|
+
);
|
|
1564
1666
|
evaluators.push({
|
|
1565
1667
|
name,
|
|
1566
1668
|
type: "is-json",
|
|
1567
1669
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1568
1670
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1671
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1569
1672
|
...negate !== void 0 ? { negate } : {}
|
|
1570
1673
|
});
|
|
1571
1674
|
continue;
|
|
@@ -1577,13 +1680,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1577
1680
|
continue;
|
|
1578
1681
|
}
|
|
1579
1682
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1580
|
-
const required2 =
|
|
1683
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1684
|
+
rawEvaluator.required,
|
|
1685
|
+
rawEvaluator.min_score,
|
|
1686
|
+
name,
|
|
1687
|
+
evalId
|
|
1688
|
+
);
|
|
1581
1689
|
evaluators.push({
|
|
1582
1690
|
name,
|
|
1583
1691
|
type: "equals",
|
|
1584
1692
|
value,
|
|
1585
1693
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1586
1694
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1695
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1587
1696
|
...negate !== void 0 ? { negate } : {}
|
|
1588
1697
|
});
|
|
1589
1698
|
continue;
|
|
@@ -1619,7 +1728,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1619
1728
|
continue;
|
|
1620
1729
|
}
|
|
1621
1730
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1622
|
-
const required2 =
|
|
1731
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1732
|
+
rawEvaluator.required,
|
|
1733
|
+
rawEvaluator.min_score,
|
|
1734
|
+
name,
|
|
1735
|
+
evalId
|
|
1736
|
+
);
|
|
1623
1737
|
evaluators.push({
|
|
1624
1738
|
name,
|
|
1625
1739
|
type: "llm-grader",
|
|
@@ -1627,6 +1741,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1627
1741
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1628
1742
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1629
1743
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1744
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1630
1745
|
...negate !== void 0 ? { negate } : {}
|
|
1631
1746
|
});
|
|
1632
1747
|
continue;
|
|
@@ -1696,7 +1811,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1696
1811
|
continue;
|
|
1697
1812
|
}
|
|
1698
1813
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1699
|
-
const required2 =
|
|
1814
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1815
|
+
rawEvaluator.required,
|
|
1816
|
+
rawEvaluator.min_score,
|
|
1817
|
+
name,
|
|
1818
|
+
evalId
|
|
1819
|
+
);
|
|
1700
1820
|
evaluators.push({
|
|
1701
1821
|
name,
|
|
1702
1822
|
type: "llm-grader",
|
|
@@ -1704,12 +1824,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1704
1824
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1705
1825
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1706
1826
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1827
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1707
1828
|
...negate !== void 0 ? { negate } : {}
|
|
1708
1829
|
});
|
|
1709
1830
|
continue;
|
|
1710
1831
|
}
|
|
1711
1832
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1712
|
-
const required =
|
|
1833
|
+
const { required, min_score } = parseRequiredAndMinScore(
|
|
1834
|
+
rawEvaluator.required,
|
|
1835
|
+
rawEvaluator.min_score,
|
|
1836
|
+
name,
|
|
1837
|
+
evalId
|
|
1838
|
+
);
|
|
1713
1839
|
const knownProps = /* @__PURE__ */ new Set([
|
|
1714
1840
|
"name",
|
|
1715
1841
|
"type",
|
|
@@ -1720,6 +1846,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1720
1846
|
"weight",
|
|
1721
1847
|
"config",
|
|
1722
1848
|
"required",
|
|
1849
|
+
"min_score",
|
|
1723
1850
|
"negate",
|
|
1724
1851
|
"max_steps",
|
|
1725
1852
|
"maxSteps",
|
|
@@ -1749,6 +1876,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1749
1876
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1750
1877
|
...weight !== void 0 ? { weight } : {},
|
|
1751
1878
|
...required !== void 0 ? { required } : {},
|
|
1879
|
+
...min_score !== void 0 ? { min_score } : {},
|
|
1752
1880
|
...negate !== void 0 ? { negate } : {},
|
|
1753
1881
|
...finalConfig ? { config: finalConfig } : {},
|
|
1754
1882
|
...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
|
|
@@ -1880,10 +2008,23 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
1880
2008
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
|
|
1881
2009
|
}
|
|
1882
2010
|
}
|
|
1883
|
-
function
|
|
1884
|
-
|
|
1885
|
-
if (typeof
|
|
1886
|
-
|
|
2011
|
+
function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
|
|
2012
|
+
const result = {};
|
|
2013
|
+
if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
|
|
2014
|
+
result.min_score = rawMinScore;
|
|
2015
|
+
}
|
|
2016
|
+
if (rawRequired === true) {
|
|
2017
|
+
result.required = true;
|
|
2018
|
+
} else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
|
|
2019
|
+
if (result.min_score === void 0) {
|
|
2020
|
+
result.min_score = rawRequired;
|
|
2021
|
+
}
|
|
2022
|
+
result.required = rawRequired;
|
|
2023
|
+
logWarning2(
|
|
2024
|
+
`Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
2025
|
+
);
|
|
2026
|
+
}
|
|
2027
|
+
return result;
|
|
1887
2028
|
}
|
|
1888
2029
|
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
1889
2030
|
if (rawWeight === void 0) {
|
|
@@ -1926,16 +2067,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1926
2067
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
1927
2068
|
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
1928
2069
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
2070
|
+
let minScore;
|
|
1929
2071
|
let requiredMinScore;
|
|
1930
2072
|
let required;
|
|
1931
|
-
if (typeof rawRubric.
|
|
1932
|
-
const
|
|
1933
|
-
if (
|
|
2073
|
+
if (typeof rawRubric.min_score === "number") {
|
|
2074
|
+
const ms = rawRubric.min_score;
|
|
2075
|
+
if (ms <= 0 || ms > 1) {
|
|
1934
2076
|
throw new Error(
|
|
1935
|
-
`Invalid
|
|
2077
|
+
`Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
|
|
1936
2078
|
);
|
|
1937
2079
|
}
|
|
1938
|
-
|
|
2080
|
+
minScore = ms;
|
|
2081
|
+
requiredMinScore = Math.round(ms * 10);
|
|
2082
|
+
} else if (typeof rawRubric.required_min_score === "number") {
|
|
2083
|
+
const rms = rawRubric.required_min_score;
|
|
2084
|
+
if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
|
|
2085
|
+
throw new Error(
|
|
2086
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
|
|
2087
|
+
);
|
|
2088
|
+
}
|
|
2089
|
+
requiredMinScore = rms;
|
|
2090
|
+
minScore = rms / 10;
|
|
2091
|
+
logWarning2(
|
|
2092
|
+
`Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
|
|
2093
|
+
);
|
|
1939
2094
|
}
|
|
1940
2095
|
if (typeof rawRubric.required === "boolean") {
|
|
1941
2096
|
required = rawRubric.required;
|
|
@@ -1955,6 +2110,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1955
2110
|
weight,
|
|
1956
2111
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
1957
2112
|
...required !== void 0 ? { required } : {},
|
|
2113
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
1958
2114
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
1959
2115
|
score_ranges: scoreRanges
|
|
1960
2116
|
});
|
|
@@ -1971,6 +2127,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1971
2127
|
weight,
|
|
1972
2128
|
// Default to required: true if not specified (backward compatibility)
|
|
1973
2129
|
required: required ?? true,
|
|
2130
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
1974
2131
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
1975
2132
|
});
|
|
1976
2133
|
}
|
|
@@ -2099,12 +2256,22 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
2099
2256
|
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
2100
2257
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
2101
2258
|
};
|
|
2259
|
+
let inlineMinScore;
|
|
2260
|
+
let inlineRequiredMinScore;
|
|
2261
|
+
if (typeof rubric.min_score === "number") {
|
|
2262
|
+
inlineMinScore = rubric.min_score;
|
|
2263
|
+
inlineRequiredMinScore = Math.round(inlineMinScore * 10);
|
|
2264
|
+
} else if (typeof rubric.required_min_score === "number") {
|
|
2265
|
+
inlineRequiredMinScore = rubric.required_min_score;
|
|
2266
|
+
inlineMinScore = inlineRequiredMinScore / 10;
|
|
2267
|
+
}
|
|
2102
2268
|
if (scoreRanges && scoreRanges.length > 0) {
|
|
2103
2269
|
return {
|
|
2104
2270
|
...baseRubric,
|
|
2105
2271
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
2106
2272
|
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
2107
|
-
...
|
|
2273
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
2274
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
|
|
2108
2275
|
score_ranges: scoreRanges
|
|
2109
2276
|
};
|
|
2110
2277
|
}
|
|
@@ -2112,7 +2279,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
2112
2279
|
...baseRubric,
|
|
2113
2280
|
outcome: expectedOutcome,
|
|
2114
2281
|
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
2115
|
-
...
|
|
2282
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
2283
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
|
|
2116
2284
|
};
|
|
2117
2285
|
}).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
|
|
2118
2286
|
if (rubricItems.length === 0) {
|
|
@@ -2512,6 +2680,9 @@ function resolveExpectedMessages(raw) {
|
|
|
2512
2680
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
2513
2681
|
var ANSI_RED2 = "\x1B[31m";
|
|
2514
2682
|
var ANSI_RESET6 = "\x1B[0m";
|
|
2683
|
+
function matchesFilter(id, filter) {
|
|
2684
|
+
return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
|
|
2685
|
+
}
|
|
2515
2686
|
function detectFormat(filePath) {
|
|
2516
2687
|
const ext = path6.extname(filePath).toLowerCase();
|
|
2517
2688
|
if (ext === ".jsonl") return "jsonl";
|
|
@@ -2579,40 +2750,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2579
2750
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
2580
2751
|
const rawFile = await readFile5(absoluteTestPath, "utf8");
|
|
2581
2752
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
2582
|
-
const
|
|
2583
|
-
const
|
|
2753
|
+
const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
2754
|
+
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
2584
2755
|
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
|
|
2585
2756
|
const globalExecution = sidecar.execution;
|
|
2586
2757
|
if (verbose) {
|
|
2587
2758
|
console.log(`
|
|
2588
|
-
[JSONL
|
|
2759
|
+
[JSONL Suite: ${evalFilePath}]`);
|
|
2589
2760
|
console.log(` Cases: ${rawCases.length}`);
|
|
2590
|
-
console.log(`
|
|
2761
|
+
console.log(` Suite: ${suiteName}`);
|
|
2591
2762
|
if (sidecar.description) {
|
|
2592
2763
|
console.log(` Description: ${sidecar.description}`);
|
|
2593
2764
|
}
|
|
2594
2765
|
}
|
|
2595
2766
|
const results = [];
|
|
2596
2767
|
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
2597
|
-
const
|
|
2768
|
+
const testCaseConfig = rawCases[lineIndex];
|
|
2598
2769
|
const lineNumber = lineIndex + 1;
|
|
2599
|
-
const id = asString4(
|
|
2600
|
-
if (filterPattern && (!id || !
|
|
2770
|
+
const id = asString4(testCaseConfig.id);
|
|
2771
|
+
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
|
|
2601
2772
|
continue;
|
|
2602
2773
|
}
|
|
2603
|
-
const conversationId = asString4(
|
|
2604
|
-
let outcome = asString4(
|
|
2605
|
-
if (!outcome &&
|
|
2606
|
-
outcome = asString4(
|
|
2774
|
+
const conversationId = asString4(testCaseConfig.conversation_id);
|
|
2775
|
+
let outcome = asString4(testCaseConfig.criteria);
|
|
2776
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
2777
|
+
outcome = asString4(testCaseConfig.expected_outcome);
|
|
2607
2778
|
if (outcome) {
|
|
2608
2779
|
logWarning4(
|
|
2609
|
-
`Test '${asString4(
|
|
2780
|
+
`Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
2610
2781
|
);
|
|
2611
2782
|
}
|
|
2612
2783
|
}
|
|
2613
|
-
const rawInputMessages = resolveInputMessages(
|
|
2614
|
-
const expectedMessages = resolveExpectedMessages(
|
|
2615
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
2784
|
+
const rawInputMessages = resolveInputMessages(testCaseConfig);
|
|
2785
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
2786
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
|
|
2616
2787
|
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
2617
2788
|
logError2(
|
|
2618
2789
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
@@ -2649,18 +2820,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2649
2820
|
}
|
|
2650
2821
|
}
|
|
2651
2822
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
2652
|
-
const caseExecution = isJsonObject(
|
|
2823
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
2653
2824
|
const mergedExecution = caseExecution ?? globalExecution;
|
|
2654
|
-
const
|
|
2825
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
2655
2826
|
let evaluators;
|
|
2656
2827
|
try {
|
|
2657
|
-
evaluators = await parseEvaluators(
|
|
2828
|
+
evaluators = await parseEvaluators(
|
|
2829
|
+
testCaseConfig,
|
|
2830
|
+
mergedExecution,
|
|
2831
|
+
searchRoots,
|
|
2832
|
+
id ?? "unknown"
|
|
2833
|
+
);
|
|
2658
2834
|
} catch (error) {
|
|
2659
2835
|
const message = error instanceof Error ? error.message : String(error);
|
|
2660
2836
|
logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
|
|
2661
2837
|
continue;
|
|
2662
2838
|
}
|
|
2663
|
-
const inlineRubrics =
|
|
2839
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
2664
2840
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
2665
2841
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
2666
2842
|
if (rubricEvaluator) {
|
|
@@ -2671,7 +2847,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2671
2847
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
2672
2848
|
const testCase = {
|
|
2673
2849
|
id,
|
|
2674
|
-
|
|
2850
|
+
suite: suiteName,
|
|
2675
2851
|
conversation_id: conversationId,
|
|
2676
2852
|
question,
|
|
2677
2853
|
input: inputMessages,
|
|
@@ -2679,7 +2855,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2679
2855
|
reference_answer: referenceAnswer,
|
|
2680
2856
|
file_paths: userFilePaths,
|
|
2681
2857
|
criteria: outcome ?? "",
|
|
2682
|
-
evaluator:
|
|
2858
|
+
evaluator: testCaseEvaluatorKind,
|
|
2683
2859
|
assertions: evaluators
|
|
2684
2860
|
};
|
|
2685
2861
|
results.push(testCase);
|
|
@@ -2862,6 +3038,9 @@ function buildChatPromptFromSegments(options) {
|
|
|
2862
3038
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
2863
3039
|
var ANSI_RED3 = "\x1B[31m";
|
|
2864
3040
|
var ANSI_RESET7 = "\x1B[0m";
|
|
3041
|
+
function matchesFilter2(id, filter) {
|
|
3042
|
+
return typeof filter === "string" ? micromatch2.isMatch(id, filter) : filter.some((pattern) => micromatch2.isMatch(id, pattern));
|
|
3043
|
+
}
|
|
2865
3044
|
function resolveTests(suite) {
|
|
2866
3045
|
if (suite.tests !== void 0) return suite.tests;
|
|
2867
3046
|
if (suite.eval_cases !== void 0) {
|
|
@@ -2941,18 +3120,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2941
3120
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
2942
3121
|
}
|
|
2943
3122
|
const suite = interpolated;
|
|
2944
|
-
const
|
|
2945
|
-
const
|
|
2946
|
-
const
|
|
2947
|
-
const
|
|
3123
|
+
const suiteNameFromFile = asString5(suite.name)?.trim();
|
|
3124
|
+
const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
3125
|
+
const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
|
|
3126
|
+
const rawTestCases = resolveTests(suite);
|
|
2948
3127
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
2949
3128
|
const evalFileDir = path7.dirname(absoluteTestPath);
|
|
2950
|
-
let
|
|
2951
|
-
if (typeof
|
|
2952
|
-
const externalPath = path7.resolve(evalFileDir,
|
|
2953
|
-
|
|
2954
|
-
} else if (Array.isArray(
|
|
2955
|
-
|
|
3129
|
+
let expandedTestCases;
|
|
3130
|
+
if (typeof rawTestCases === "string") {
|
|
3131
|
+
const externalPath = path7.resolve(evalFileDir, rawTestCases);
|
|
3132
|
+
expandedTestCases = await loadCasesFromFile(externalPath);
|
|
3133
|
+
} else if (Array.isArray(rawTestCases)) {
|
|
3134
|
+
expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
|
|
2956
3135
|
} else {
|
|
2957
3136
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
2958
3137
|
}
|
|
@@ -2967,32 +3146,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2967
3146
|
}
|
|
2968
3147
|
const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
|
|
2969
3148
|
const results = [];
|
|
2970
|
-
for (const
|
|
2971
|
-
if (!isJsonObject(
|
|
3149
|
+
for (const rawTestCase of expandedTestCases) {
|
|
3150
|
+
if (!isJsonObject(rawTestCase)) {
|
|
2972
3151
|
logWarning5("Skipping invalid test entry (expected object)");
|
|
2973
3152
|
continue;
|
|
2974
3153
|
}
|
|
2975
|
-
const
|
|
2976
|
-
const id = asString5(
|
|
2977
|
-
if (filterPattern && (!id || !
|
|
3154
|
+
const testCaseConfig = rawTestCase;
|
|
3155
|
+
const id = asString5(testCaseConfig.id);
|
|
3156
|
+
if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
|
|
2978
3157
|
continue;
|
|
2979
3158
|
}
|
|
2980
|
-
const conversationId = asString5(
|
|
2981
|
-
let outcome = asString5(
|
|
2982
|
-
if (!outcome &&
|
|
2983
|
-
outcome = asString5(
|
|
3159
|
+
const conversationId = asString5(testCaseConfig.conversation_id);
|
|
3160
|
+
let outcome = asString5(testCaseConfig.criteria);
|
|
3161
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
3162
|
+
outcome = asString5(testCaseConfig.expected_outcome);
|
|
2984
3163
|
if (outcome) {
|
|
2985
3164
|
logWarning5(
|
|
2986
|
-
`Test '${asString5(
|
|
3165
|
+
`Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
2987
3166
|
);
|
|
2988
3167
|
}
|
|
2989
3168
|
}
|
|
2990
|
-
const caseExecution = isJsonObject(
|
|
3169
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
2991
3170
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
3171
|
+
const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
|
|
2992
3172
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
2993
|
-
const testInputMessages = resolveInputMessages(
|
|
2994
|
-
const expectedMessages = resolveExpectedMessages(
|
|
2995
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
3173
|
+
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
3174
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
3175
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
2996
3176
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
2997
3177
|
logError3(
|
|
2998
3178
|
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
@@ -3039,16 +3219,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3039
3219
|
}
|
|
3040
3220
|
}
|
|
3041
3221
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
3042
|
-
const
|
|
3222
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
3043
3223
|
let evaluators;
|
|
3044
3224
|
try {
|
|
3045
|
-
evaluators = await parseEvaluators(
|
|
3225
|
+
evaluators = await parseEvaluators(
|
|
3226
|
+
testCaseConfig,
|
|
3227
|
+
globalExecution,
|
|
3228
|
+
searchRoots,
|
|
3229
|
+
id ?? "unknown"
|
|
3230
|
+
);
|
|
3046
3231
|
} catch (error) {
|
|
3047
3232
|
const message = error instanceof Error ? error.message : String(error);
|
|
3048
3233
|
logError3(`Skipping test '${id}': ${message}`);
|
|
3049
3234
|
continue;
|
|
3050
3235
|
}
|
|
3051
|
-
const inlineRubrics =
|
|
3236
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
3052
3237
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
3053
3238
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
3054
3239
|
if (rubricEvaluator) {
|
|
@@ -3057,13 +3242,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3057
3242
|
}
|
|
3058
3243
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
3059
3244
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
3060
|
-
const caseWorkspace = await resolveWorkspaceConfig(
|
|
3245
|
+
const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
|
|
3061
3246
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
3062
|
-
const metadata = isJsonObject(
|
|
3063
|
-
const caseTargets = extractTargetsFromTestCase(
|
|
3247
|
+
const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
|
|
3248
|
+
const caseTargets = extractTargetsFromTestCase(testCaseConfig);
|
|
3064
3249
|
const testCase = {
|
|
3065
3250
|
id,
|
|
3066
|
-
|
|
3251
|
+
suite: suiteName,
|
|
3067
3252
|
category: options?.category,
|
|
3068
3253
|
conversation_id: conversationId,
|
|
3069
3254
|
question,
|
|
@@ -3072,11 +3257,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3072
3257
|
reference_answer: referenceAnswer,
|
|
3073
3258
|
file_paths: userFilePaths,
|
|
3074
3259
|
criteria: outcome ?? "",
|
|
3075
|
-
evaluator:
|
|
3260
|
+
evaluator: testCaseEvaluatorKind,
|
|
3076
3261
|
assertions: evaluators,
|
|
3077
3262
|
workspace: mergedWorkspace,
|
|
3078
3263
|
metadata,
|
|
3079
|
-
targets: caseTargets
|
|
3264
|
+
targets: caseTargets,
|
|
3265
|
+
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
|
|
3080
3266
|
};
|
|
3081
3267
|
results.push(testCase);
|
|
3082
3268
|
}
|
|
@@ -3620,7 +3806,7 @@ var AzureProvider = class {
|
|
|
3620
3806
|
};
|
|
3621
3807
|
this.retryConfig = config.retry;
|
|
3622
3808
|
const azure = createAzure(buildAzureOptions(config));
|
|
3623
|
-
this.model = azure.chat(config.deploymentName);
|
|
3809
|
+
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
3624
3810
|
}
|
|
3625
3811
|
id;
|
|
3626
3812
|
kind = "azure";
|
|
@@ -3746,7 +3932,9 @@ function buildAzureOptions(config) {
|
|
|
3746
3932
|
const options = {
|
|
3747
3933
|
apiKey: config.apiKey,
|
|
3748
3934
|
apiVersion: config.version,
|
|
3749
|
-
|
|
3935
|
+
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
3936
|
+
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
3937
|
+
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
3750
3938
|
};
|
|
3751
3939
|
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
3752
3940
|
if (baseURL) {
|
|
@@ -7354,6 +7542,22 @@ function extractAzureResourceName(baseUrl) {
|
|
|
7354
7542
|
if (urlMatch) return urlMatch[1];
|
|
7355
7543
|
return baseUrl;
|
|
7356
7544
|
}
|
|
7545
|
+
function normalizeAzureSdkBaseUrl(baseUrl) {
|
|
7546
|
+
const trimmed = baseUrl.trim().replace(/\/+$/, "");
|
|
7547
|
+
if (!trimmed) {
|
|
7548
|
+
return trimmed;
|
|
7549
|
+
}
|
|
7550
|
+
if (!/^https?:\/\//i.test(trimmed)) {
|
|
7551
|
+
return `https://${trimmed}.openai.azure.com/openai/v1`;
|
|
7552
|
+
}
|
|
7553
|
+
if (/\/openai\/v1$/i.test(trimmed)) {
|
|
7554
|
+
return trimmed;
|
|
7555
|
+
}
|
|
7556
|
+
if (/\/openai$/i.test(trimmed)) {
|
|
7557
|
+
return `${trimmed}/v1`;
|
|
7558
|
+
}
|
|
7559
|
+
return `${trimmed}/openai/v1`;
|
|
7560
|
+
}
|
|
7357
7561
|
|
|
7358
7562
|
// src/evaluation/providers/pi-utils.ts
|
|
7359
7563
|
function extractPiTextContent(content) {
|
|
@@ -8184,11 +8388,41 @@ async function defaultPiRunner(options) {
|
|
|
8184
8388
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
8185
8389
|
import { execSync as execSync2 } from "node:child_process";
|
|
8186
8390
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
8187
|
-
import { accessSync as accessSync2, createWriteStream as createWriteStream6 } from "node:fs";
|
|
8391
|
+
import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
|
|
8188
8392
|
import { mkdir as mkdir7 } from "node:fs/promises";
|
|
8189
|
-
import
|
|
8393
|
+
import path21 from "node:path";
|
|
8190
8394
|
import { createInterface } from "node:readline";
|
|
8191
|
-
import { fileURLToPath as fileURLToPath3 } from "node:url";
|
|
8395
|
+
import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
|
|
8396
|
+
|
|
8397
|
+
// src/paths.ts
|
|
8398
|
+
import os2 from "node:os";
|
|
8399
|
+
import path20 from "node:path";
|
|
8400
|
+
var logged = false;
|
|
8401
|
+
function getAgentvHome() {
|
|
8402
|
+
const envHome = process.env.AGENTV_HOME;
|
|
8403
|
+
if (envHome && envHome !== "undefined") {
|
|
8404
|
+
if (!logged) {
|
|
8405
|
+
logged = true;
|
|
8406
|
+
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
8407
|
+
}
|
|
8408
|
+
return envHome;
|
|
8409
|
+
}
|
|
8410
|
+
return path20.join(os2.homedir(), ".agentv");
|
|
8411
|
+
}
|
|
8412
|
+
function getWorkspacesRoot() {
|
|
8413
|
+
return path20.join(getAgentvHome(), "workspaces");
|
|
8414
|
+
}
|
|
8415
|
+
function getSubagentsRoot() {
|
|
8416
|
+
return path20.join(getAgentvHome(), "subagents");
|
|
8417
|
+
}
|
|
8418
|
+
function getTraceStateRoot() {
|
|
8419
|
+
return path20.join(getAgentvHome(), "trace-state");
|
|
8420
|
+
}
|
|
8421
|
+
function getWorkspacePoolRoot() {
|
|
8422
|
+
return path20.join(getAgentvHome(), "workspace-pool");
|
|
8423
|
+
}
|
|
8424
|
+
|
|
8425
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
8192
8426
|
var piCodingAgentModule = null;
|
|
8193
8427
|
var piAiModule = null;
|
|
8194
8428
|
var loadingPromise = null;
|
|
@@ -8206,46 +8440,126 @@ async function promptInstall() {
|
|
|
8206
8440
|
rl.close();
|
|
8207
8441
|
}
|
|
8208
8442
|
}
|
|
8209
|
-
function
|
|
8210
|
-
|
|
8211
|
-
|
|
8212
|
-
|
|
8443
|
+
function findManagedSdkInstallRoot() {
|
|
8444
|
+
return path21.join(getAgentvHome(), "deps", "pi-sdk");
|
|
8445
|
+
}
|
|
8446
|
+
function resolveGlobalNpmRoot() {
|
|
8447
|
+
try {
|
|
8448
|
+
const root = execSync2("npm root -g", {
|
|
8449
|
+
encoding: "utf-8",
|
|
8450
|
+
stdio: ["ignore", "pipe", "ignore"]
|
|
8451
|
+
}).trim();
|
|
8452
|
+
return root.length > 0 ? root : void 0;
|
|
8453
|
+
} catch {
|
|
8454
|
+
return void 0;
|
|
8455
|
+
}
|
|
8456
|
+
}
|
|
8457
|
+
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
|
|
8458
|
+
return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
|
|
8459
|
+
}
|
|
8460
|
+
function findAccessiblePath(paths) {
|
|
8461
|
+
for (const candidate of paths) {
|
|
8213
8462
|
try {
|
|
8214
|
-
|
|
8215
|
-
|
|
8216
|
-
return dir;
|
|
8463
|
+
accessSync2(candidate);
|
|
8464
|
+
return candidate;
|
|
8217
8465
|
} catch {
|
|
8218
|
-
const parent = path20.dirname(dir);
|
|
8219
|
-
if (parent === dir) break;
|
|
8220
|
-
dir = parent;
|
|
8221
8466
|
}
|
|
8222
8467
|
}
|
|
8223
|
-
return
|
|
8468
|
+
return void 0;
|
|
8224
8469
|
}
|
|
8225
|
-
async function
|
|
8470
|
+
async function tryImportLocalSdkModules() {
|
|
8226
8471
|
try {
|
|
8227
8472
|
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
8228
8473
|
import("@mariozechner/pi-coding-agent"),
|
|
8229
8474
|
import("@mariozechner/pi-ai")
|
|
8230
8475
|
]);
|
|
8476
|
+
return true;
|
|
8231
8477
|
} catch {
|
|
8232
|
-
|
|
8233
|
-
|
|
8234
|
-
|
|
8235
|
-
|
|
8236
|
-
|
|
8237
|
-
|
|
8238
|
-
|
|
8239
|
-
|
|
8240
|
-
|
|
8241
|
-
|
|
8242
|
-
|
|
8243
|
-
|
|
8244
|
-
|
|
8245
|
-
|
|
8246
|
-
|
|
8478
|
+
return false;
|
|
8479
|
+
}
|
|
8480
|
+
}
|
|
8481
|
+
async function tryImportManagedSdkModules() {
|
|
8482
|
+
const managedRoot = findManagedSdkInstallRoot();
|
|
8483
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
8484
|
+
path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
|
|
8485
|
+
]);
|
|
8486
|
+
const piAiEntry = findAccessiblePath([
|
|
8487
|
+
path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
|
|
8488
|
+
path21.join(
|
|
8489
|
+
managedRoot,
|
|
8490
|
+
"node_modules",
|
|
8491
|
+
"@mariozechner",
|
|
8492
|
+
"pi-coding-agent",
|
|
8493
|
+
"node_modules",
|
|
8494
|
+
"@mariozechner",
|
|
8495
|
+
"pi-ai",
|
|
8496
|
+
"dist",
|
|
8497
|
+
"index.js"
|
|
8498
|
+
)
|
|
8499
|
+
]);
|
|
8500
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
8501
|
+
try {
|
|
8502
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
8503
|
+
import(pathToFileURL(piCodingAgentEntry).href),
|
|
8504
|
+
import(pathToFileURL(piAiEntry).href)
|
|
8505
|
+
]);
|
|
8506
|
+
return true;
|
|
8507
|
+
} catch {
|
|
8508
|
+
return false;
|
|
8509
|
+
}
|
|
8510
|
+
}
|
|
8511
|
+
async function tryImportGlobalSdkModules() {
|
|
8512
|
+
const globalNpmRoot = resolveGlobalNpmRoot();
|
|
8513
|
+
if (!globalNpmRoot) return false;
|
|
8514
|
+
const piCodingAgentEntry = findAccessiblePath([
|
|
8515
|
+
buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
|
|
8516
|
+
]);
|
|
8517
|
+
const piAiEntry = findAccessiblePath([
|
|
8518
|
+
buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
|
|
8519
|
+
path21.join(
|
|
8520
|
+
globalNpmRoot,
|
|
8521
|
+
"@mariozechner",
|
|
8522
|
+
"pi-coding-agent",
|
|
8523
|
+
"node_modules",
|
|
8524
|
+
"@mariozechner",
|
|
8525
|
+
"pi-ai",
|
|
8526
|
+
"dist",
|
|
8527
|
+
"index.js"
|
|
8528
|
+
)
|
|
8529
|
+
]);
|
|
8530
|
+
if (!piCodingAgentEntry || !piAiEntry) return false;
|
|
8531
|
+
try {
|
|
8532
|
+
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
8533
|
+
import(pathToFileURL(piCodingAgentEntry).href),
|
|
8534
|
+
import(pathToFileURL(piAiEntry).href)
|
|
8535
|
+
]);
|
|
8536
|
+
return true;
|
|
8537
|
+
} catch {
|
|
8538
|
+
return false;
|
|
8539
|
+
}
|
|
8540
|
+
}
|
|
8541
|
+
function installSdkModules(installDir) {
|
|
8542
|
+
console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
|
|
8543
|
+
mkdirSync(installDir, { recursive: true });
|
|
8544
|
+
execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
|
|
8545
|
+
cwd: installDir,
|
|
8546
|
+
stdio: "inherit"
|
|
8547
|
+
});
|
|
8548
|
+
}
|
|
8549
|
+
async function doLoadSdkModules() {
|
|
8550
|
+
if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
|
|
8551
|
+
return;
|
|
8552
|
+
}
|
|
8553
|
+
if (await promptInstall()) {
|
|
8554
|
+
const installDir = findManagedSdkInstallRoot();
|
|
8555
|
+
installSdkModules(installDir);
|
|
8556
|
+
if (await tryImportManagedSdkModules()) {
|
|
8557
|
+
return;
|
|
8247
8558
|
}
|
|
8248
8559
|
}
|
|
8560
|
+
throw new Error(
|
|
8561
|
+
"pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
|
|
8562
|
+
);
|
|
8249
8563
|
}
|
|
8250
8564
|
async function loadSdkModules() {
|
|
8251
8565
|
if (!piCodingAgentModule || !piAiModule) {
|
|
@@ -8302,12 +8616,16 @@ var PiCodingAgentProvider = class {
|
|
|
8302
8616
|
try {
|
|
8303
8617
|
const cwd = this.resolveCwd(request.cwd);
|
|
8304
8618
|
const rawProvider = this.config.subprovider ?? "google";
|
|
8305
|
-
const
|
|
8619
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
|
|
8620
|
+
const hasBaseUrl = !!normalizedBaseUrl;
|
|
8306
8621
|
const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
|
|
8307
8622
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
8308
8623
|
this.setApiKeyEnv(rawProvider, hasBaseUrl);
|
|
8309
|
-
this.setBaseUrlEnv(rawProvider, hasBaseUrl);
|
|
8624
|
+
this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
|
|
8310
8625
|
let model = sdk.getModel(providerName, modelId);
|
|
8626
|
+
if (model && normalizedBaseUrl) {
|
|
8627
|
+
model = { ...model, baseUrl: normalizedBaseUrl };
|
|
8628
|
+
}
|
|
8311
8629
|
if (!model) {
|
|
8312
8630
|
const envProvider = providerName.replace(/-responses$/, "");
|
|
8313
8631
|
model = {
|
|
@@ -8315,7 +8633,7 @@ var PiCodingAgentProvider = class {
|
|
|
8315
8633
|
name: modelId,
|
|
8316
8634
|
api: providerName,
|
|
8317
8635
|
provider: envProvider,
|
|
8318
|
-
baseUrl:
|
|
8636
|
+
baseUrl: normalizedBaseUrl ?? "",
|
|
8319
8637
|
reasoning: false,
|
|
8320
8638
|
input: ["text"],
|
|
8321
8639
|
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
|
@@ -8482,19 +8800,27 @@ ${fileList}`;
|
|
|
8482
8800
|
}
|
|
8483
8801
|
}
|
|
8484
8802
|
/** Maps config baseUrl to the provider-specific env var the SDK reads. */
|
|
8485
|
-
setBaseUrlEnv(providerName, hasBaseUrl = false) {
|
|
8486
|
-
|
|
8803
|
+
setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
|
|
8804
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
|
|
8805
|
+
if (!normalizedBaseUrl) return;
|
|
8487
8806
|
const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
|
|
8488
8807
|
if (envKey) {
|
|
8489
|
-
process.env[envKey] =
|
|
8808
|
+
process.env[envKey] = normalizedBaseUrl;
|
|
8490
8809
|
}
|
|
8491
8810
|
}
|
|
8811
|
+
normalizeSdkBaseUrl(providerName, baseUrl) {
|
|
8812
|
+
if (!baseUrl) return void 0;
|
|
8813
|
+
if (providerName.toLowerCase() === "azure") {
|
|
8814
|
+
return normalizeAzureSdkBaseUrl(baseUrl);
|
|
8815
|
+
}
|
|
8816
|
+
return baseUrl;
|
|
8817
|
+
}
|
|
8492
8818
|
resolveCwd(cwdOverride) {
|
|
8493
8819
|
if (cwdOverride) {
|
|
8494
|
-
return
|
|
8820
|
+
return path21.resolve(cwdOverride);
|
|
8495
8821
|
}
|
|
8496
8822
|
if (this.config.cwd) {
|
|
8497
|
-
return
|
|
8823
|
+
return path21.resolve(this.config.cwd);
|
|
8498
8824
|
}
|
|
8499
8825
|
return process.cwd();
|
|
8500
8826
|
}
|
|
@@ -8513,9 +8839,9 @@ ${fileList}`;
|
|
|
8513
8839
|
}
|
|
8514
8840
|
resolveLogDirectory() {
|
|
8515
8841
|
if (this.config.logDir) {
|
|
8516
|
-
return
|
|
8842
|
+
return path21.resolve(this.config.logDir);
|
|
8517
8843
|
}
|
|
8518
|
-
return
|
|
8844
|
+
return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
8519
8845
|
}
|
|
8520
8846
|
async createStreamLogger(request) {
|
|
8521
8847
|
const logDir = this.resolveLogDirectory();
|
|
@@ -8529,7 +8855,7 @@ ${fileList}`;
|
|
|
8529
8855
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
8530
8856
|
return void 0;
|
|
8531
8857
|
}
|
|
8532
|
-
const filePath =
|
|
8858
|
+
const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
|
|
8533
8859
|
try {
|
|
8534
8860
|
const logger = await PiStreamLogger2.create({
|
|
8535
8861
|
filePath,
|
|
@@ -8754,7 +9080,7 @@ import path30 from "node:path";
|
|
|
8754
9080
|
// src/evaluation/providers/vscode/utils/fs.ts
|
|
8755
9081
|
import { constants as constants2 } from "node:fs";
|
|
8756
9082
|
import { access as access2, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
|
|
8757
|
-
import
|
|
9083
|
+
import path22 from "node:path";
|
|
8758
9084
|
async function pathExists(target) {
|
|
8759
9085
|
try {
|
|
8760
9086
|
await access2(target, constants2.F_OK);
|
|
@@ -8770,7 +9096,7 @@ async function readDirEntries(target) {
|
|
|
8770
9096
|
const entries = await readdir2(target, { withFileTypes: true });
|
|
8771
9097
|
return entries.map((entry) => ({
|
|
8772
9098
|
name: entry.name,
|
|
8773
|
-
absolutePath:
|
|
9099
|
+
absolutePath: path22.join(target, entry.name),
|
|
8774
9100
|
isDirectory: entry.isDirectory()
|
|
8775
9101
|
}));
|
|
8776
9102
|
}
|
|
@@ -8785,9 +9111,9 @@ async function removeIfExists(target) {
|
|
|
8785
9111
|
}
|
|
8786
9112
|
|
|
8787
9113
|
// src/evaluation/providers/vscode/utils/path.ts
|
|
8788
|
-
import
|
|
9114
|
+
import path23 from "node:path";
|
|
8789
9115
|
function pathToFileUri2(filePath) {
|
|
8790
|
-
const absolutePath =
|
|
9116
|
+
const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
|
|
8791
9117
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
8792
9118
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
8793
9119
|
return `file:///${normalizedPath}`;
|
|
@@ -8796,7 +9122,7 @@ function pathToFileUri2(filePath) {
|
|
|
8796
9122
|
}
|
|
8797
9123
|
|
|
8798
9124
|
// src/evaluation/providers/vscode/dispatch/promptBuilder.ts
|
|
8799
|
-
import
|
|
9125
|
+
import path24 from "node:path";
|
|
8800
9126
|
|
|
8801
9127
|
// src/evaluation/providers/vscode/utils/template.ts
|
|
8802
9128
|
function renderTemplate2(content, variables) {
|
|
@@ -8888,8 +9214,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
|
|
|
8888
9214
|
});
|
|
8889
9215
|
}
|
|
8890
9216
|
function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
|
|
8891
|
-
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${
|
|
8892
|
-
const responseList = responseFiles.map((file) => `"${
|
|
9217
|
+
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
|
|
9218
|
+
const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
|
|
8893
9219
|
return renderTemplate2(templateContent, {
|
|
8894
9220
|
requestFiles: requestLines,
|
|
8895
9221
|
responseList
|
|
@@ -8898,7 +9224,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
8898
9224
|
|
|
8899
9225
|
// src/evaluation/providers/vscode/dispatch/responseWaiter.ts
|
|
8900
9226
|
import { readFile as readFile9 } from "node:fs/promises";
|
|
8901
|
-
import
|
|
9227
|
+
import path25 from "node:path";
|
|
8902
9228
|
|
|
8903
9229
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
8904
9230
|
function sleep2(ms) {
|
|
@@ -8957,7 +9283,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
8957
9283
|
}
|
|
8958
9284
|
async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
|
|
8959
9285
|
if (!silent) {
|
|
8960
|
-
const fileList = responseFilesFinal.map((file) =>
|
|
9286
|
+
const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
|
|
8961
9287
|
console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
|
|
8962
9288
|
}
|
|
8963
9289
|
const deadline = Date.now() + timeoutMs;
|
|
@@ -8966,7 +9292,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
8966
9292
|
while (pending.size > 0) {
|
|
8967
9293
|
if (Date.now() >= deadline) {
|
|
8968
9294
|
if (!silent) {
|
|
8969
|
-
const remaining = [...pending].map((f) =>
|
|
9295
|
+
const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
|
|
8970
9296
|
console.error(
|
|
8971
9297
|
`error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
|
|
8972
9298
|
);
|
|
@@ -9022,36 +9348,6 @@ import { promisify as promisify2 } from "node:util";
|
|
|
9022
9348
|
|
|
9023
9349
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
9024
9350
|
import path26 from "node:path";
|
|
9025
|
-
|
|
9026
|
-
// src/paths.ts
|
|
9027
|
-
import os2 from "node:os";
|
|
9028
|
-
import path25 from "node:path";
|
|
9029
|
-
var logged = false;
|
|
9030
|
-
function getAgentvHome() {
|
|
9031
|
-
const envHome = process.env.AGENTV_HOME;
|
|
9032
|
-
if (envHome && envHome !== "undefined") {
|
|
9033
|
-
if (!logged) {
|
|
9034
|
-
logged = true;
|
|
9035
|
-
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
9036
|
-
}
|
|
9037
|
-
return envHome;
|
|
9038
|
-
}
|
|
9039
|
-
return path25.join(os2.homedir(), ".agentv");
|
|
9040
|
-
}
|
|
9041
|
-
function getWorkspacesRoot() {
|
|
9042
|
-
return path25.join(getAgentvHome(), "workspaces");
|
|
9043
|
-
}
|
|
9044
|
-
function getSubagentsRoot() {
|
|
9045
|
-
return path25.join(getAgentvHome(), "subagents");
|
|
9046
|
-
}
|
|
9047
|
-
function getTraceStateRoot() {
|
|
9048
|
-
return path25.join(getAgentvHome(), "trace-state");
|
|
9049
|
-
}
|
|
9050
|
-
function getWorkspacePoolRoot() {
|
|
9051
|
-
return path25.join(getAgentvHome(), "workspace-pool");
|
|
9052
|
-
}
|
|
9053
|
-
|
|
9054
|
-
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
9055
9351
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
9056
9352
|
var DEFAULT_ALIVE_FILENAME = ".alive";
|
|
9057
9353
|
function getDefaultSubagentRoot(vscodeCmd = "code") {
|
|
@@ -10308,9 +10604,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
10308
10604
|
}
|
|
10309
10605
|
|
|
10310
10606
|
// src/evaluation/evaluators/scoring.ts
|
|
10311
|
-
var
|
|
10312
|
-
|
|
10313
|
-
|
|
10607
|
+
var DEFAULT_THRESHOLD = 0.8;
|
|
10608
|
+
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
10609
|
+
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
|
|
10610
|
+
return score >= threshold ? "pass" : "fail";
|
|
10314
10611
|
}
|
|
10315
10612
|
function clampScore(value) {
|
|
10316
10613
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -10499,13 +10796,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
10499
10796
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
10500
10797
|
const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
10501
10798
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
10502
|
-
const
|
|
10799
|
+
const path50 = await import("node:path");
|
|
10503
10800
|
const { randomUUID: randomUUID10 } = await import("node:crypto");
|
|
10504
|
-
const dir =
|
|
10801
|
+
const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
10505
10802
|
await mkdir16(dir, { recursive: true });
|
|
10506
|
-
const stdinPath =
|
|
10507
|
-
const stdoutPath =
|
|
10508
|
-
const stderrPath =
|
|
10803
|
+
const stdinPath = path50.join(dir, "stdin.txt");
|
|
10804
|
+
const stdoutPath = path50.join(dir, "stdout.txt");
|
|
10805
|
+
const stderrPath = path50.join(dir, "stderr.txt");
|
|
10509
10806
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
10510
10807
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
10511
10808
|
const { spawn: spawn5 } = await import("node:child_process");
|
|
@@ -11703,7 +12000,7 @@ ${outputSchema}`;
|
|
|
11703
12000
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
11704
12001
|
for (const rubric of rubrics) {
|
|
11705
12002
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
11706
|
-
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
12003
|
+
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
11707
12004
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
11708
12005
|
if (rubric.outcome) {
|
|
11709
12006
|
parts.push(`Description: ${rubric.outcome}`);
|
|
@@ -11757,54 +12054,106 @@ ${outputSchema}`;
|
|
|
11757
12054
|
async runWithRetry(options) {
|
|
11758
12055
|
const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
11759
12056
|
let lastError;
|
|
12057
|
+
let lastInvalidResponse;
|
|
12058
|
+
let shouldAttemptStructureFix = false;
|
|
11760
12059
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
11761
12060
|
try {
|
|
11762
|
-
const
|
|
11763
|
-
|
|
11764
|
-
|
|
11765
|
-
|
|
11766
|
-
|
|
11767
|
-
|
|
11768
|
-
|
|
11769
|
-
|
|
11770
|
-
|
|
11771
|
-
|
|
11772
|
-
|
|
11773
|
-
|
|
11774
|
-
|
|
11775
|
-
|
|
11776
|
-
|
|
11777
|
-
|
|
11778
|
-
]
|
|
11779
|
-
}
|
|
11780
|
-
],
|
|
11781
|
-
...modelOptions
|
|
11782
|
-
}) : await generateText2({
|
|
11783
|
-
model,
|
|
11784
|
-
system: systemPrompt,
|
|
11785
|
-
prompt: userPrompt,
|
|
11786
|
-
...modelOptions
|
|
11787
|
-
});
|
|
11788
|
-
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
11789
|
-
const rawUsage = result.usage;
|
|
11790
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
11791
|
-
return { data: data2, tokenUsage };
|
|
12061
|
+
const result = await this.generateStructuredResponse({
|
|
12062
|
+
context,
|
|
12063
|
+
graderProvider,
|
|
12064
|
+
systemPrompt,
|
|
12065
|
+
userPrompt,
|
|
12066
|
+
images
|
|
12067
|
+
});
|
|
12068
|
+
const canRepairResponse = result.text.trim().length > 0;
|
|
12069
|
+
lastInvalidResponse = canRepairResponse ? result : void 0;
|
|
12070
|
+
let data;
|
|
12071
|
+
try {
|
|
12072
|
+
data = schema.parse(parseJsonFromText(result.text));
|
|
12073
|
+
} catch (e) {
|
|
12074
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
12075
|
+
shouldAttemptStructureFix = canRepairResponse;
|
|
12076
|
+
continue;
|
|
11792
12077
|
}
|
|
11793
|
-
|
|
11794
|
-
|
|
12078
|
+
return {
|
|
12079
|
+
data,
|
|
12080
|
+
providerResponse: result.providerResponse,
|
|
12081
|
+
tokenUsage: result.tokenUsage
|
|
12082
|
+
};
|
|
12083
|
+
} catch (e) {
|
|
12084
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
12085
|
+
}
|
|
12086
|
+
}
|
|
12087
|
+
if (shouldAttemptStructureFix && lastInvalidResponse) {
|
|
12088
|
+
try {
|
|
12089
|
+
const repaired = await this.generateStructuredResponse({
|
|
12090
|
+
context,
|
|
12091
|
+
graderProvider,
|
|
11795
12092
|
systemPrompt,
|
|
11796
|
-
|
|
11797
|
-
|
|
11798
|
-
|
|
11799
|
-
|
|
12093
|
+
userPrompt: buildStructureRepairPrompt({
|
|
12094
|
+
validationError: lastError?.message ?? "Schema validation failed",
|
|
12095
|
+
invalidResponse: lastInvalidResponse.text
|
|
12096
|
+
})
|
|
11800
12097
|
});
|
|
11801
|
-
const data = schema.parse(parseJsonFromText(
|
|
11802
|
-
return {
|
|
12098
|
+
const data = schema.parse(parseJsonFromText(repaired.text));
|
|
12099
|
+
return {
|
|
12100
|
+
data,
|
|
12101
|
+
providerResponse: repaired.providerResponse,
|
|
12102
|
+
tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
|
|
12103
|
+
};
|
|
11803
12104
|
} catch (e) {
|
|
11804
12105
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
11805
12106
|
}
|
|
11806
12107
|
}
|
|
11807
|
-
throw new Error(
|
|
12108
|
+
throw new Error(
|
|
12109
|
+
`Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
|
|
12110
|
+
);
|
|
12111
|
+
}
|
|
12112
|
+
async generateStructuredResponse(options) {
|
|
12113
|
+
const { context, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
12114
|
+
const model = graderProvider.asLanguageModel?.();
|
|
12115
|
+
if (model) {
|
|
12116
|
+
const modelOptions = {
|
|
12117
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
12118
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
12119
|
+
};
|
|
12120
|
+
const hasImages = images && images.length > 0;
|
|
12121
|
+
const result = hasImages ? await generateText2({
|
|
12122
|
+
model,
|
|
12123
|
+
system: systemPrompt,
|
|
12124
|
+
messages: [
|
|
12125
|
+
{
|
|
12126
|
+
role: "user",
|
|
12127
|
+
content: [
|
|
12128
|
+
{ type: "text", text: userPrompt },
|
|
12129
|
+
...toAiSdkImageParts(images)
|
|
12130
|
+
]
|
|
12131
|
+
}
|
|
12132
|
+
],
|
|
12133
|
+
...modelOptions
|
|
12134
|
+
}) : await generateText2({
|
|
12135
|
+
model,
|
|
12136
|
+
system: systemPrompt,
|
|
12137
|
+
prompt: userPrompt,
|
|
12138
|
+
...modelOptions
|
|
12139
|
+
});
|
|
12140
|
+
const rawUsage = result.usage;
|
|
12141
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
12142
|
+
return { text: result.text, tokenUsage };
|
|
12143
|
+
}
|
|
12144
|
+
const response = await graderProvider.invoke({
|
|
12145
|
+
question: userPrompt,
|
|
12146
|
+
systemPrompt,
|
|
12147
|
+
evalCaseId: context.evalCase.id,
|
|
12148
|
+
attempt: context.attempt,
|
|
12149
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
12150
|
+
temperature: this.temperature
|
|
12151
|
+
});
|
|
12152
|
+
return {
|
|
12153
|
+
text: extractLastAssistantContent(response.output),
|
|
12154
|
+
providerResponse: response,
|
|
12155
|
+
tokenUsage: response.tokenUsage
|
|
12156
|
+
};
|
|
11808
12157
|
}
|
|
11809
12158
|
};
|
|
11810
12159
|
function buildOutputSchema() {
|
|
@@ -11824,6 +12173,29 @@ function buildOutputSchema() {
|
|
|
11824
12173
|
"}"
|
|
11825
12174
|
].join("\n");
|
|
11826
12175
|
}
|
|
12176
|
+
function buildStructureRepairPrompt(options) {
|
|
12177
|
+
const { validationError, invalidResponse } = options;
|
|
12178
|
+
return [
|
|
12179
|
+
"The following evaluation response has useful grading content but invalid JSON structure.",
|
|
12180
|
+
"Repair it to satisfy the schema in the system prompt.",
|
|
12181
|
+
"Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
|
|
12182
|
+
"",
|
|
12183
|
+
"Validation error:",
|
|
12184
|
+
validationError,
|
|
12185
|
+
"",
|
|
12186
|
+
"Invalid response:",
|
|
12187
|
+
invalidResponse
|
|
12188
|
+
].join("\n");
|
|
12189
|
+
}
|
|
12190
|
+
function sumTokenUsage(first, second) {
|
|
12191
|
+
if (!first && !second) {
|
|
12192
|
+
return void 0;
|
|
12193
|
+
}
|
|
12194
|
+
return {
|
|
12195
|
+
input: (first?.input ?? 0) + (second?.input ?? 0),
|
|
12196
|
+
output: (first?.output ?? 0) + (second?.output ?? 0)
|
|
12197
|
+
};
|
|
12198
|
+
}
|
|
11827
12199
|
function buildRubricOutputSchema() {
|
|
11828
12200
|
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
11829
12201
|
You must return a valid JSON object matching this schema:
|
|
@@ -11923,19 +12295,21 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
11923
12295
|
rawScores[rubric.id] = rawScore;
|
|
11924
12296
|
totalWeight += rubric.weight;
|
|
11925
12297
|
weightedScoreSum += normalizedScore * rubric.weight;
|
|
11926
|
-
let
|
|
11927
|
-
if (rubric.
|
|
11928
|
-
|
|
12298
|
+
let minScoreThreshold;
|
|
12299
|
+
if (rubric.min_score !== void 0) {
|
|
12300
|
+
minScoreThreshold = rubric.min_score;
|
|
12301
|
+
} else if (rubric.required_min_score !== void 0) {
|
|
12302
|
+
minScoreThreshold = rubric.required_min_score / 10;
|
|
11929
12303
|
} else if (rubric.required === true) {
|
|
11930
|
-
|
|
12304
|
+
minScoreThreshold = 1;
|
|
11931
12305
|
}
|
|
11932
12306
|
const matchingRange = rubric.score_ranges?.find(
|
|
11933
12307
|
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
11934
12308
|
);
|
|
11935
12309
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
11936
12310
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
11937
|
-
const passed = !(
|
|
11938
|
-
if (
|
|
12311
|
+
const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
|
|
12312
|
+
if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
|
|
11939
12313
|
failedRequired = true;
|
|
11940
12314
|
}
|
|
11941
12315
|
assertions.push({
|
|
@@ -12012,11 +12386,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
12012
12386
|
execute: async (input) => {
|
|
12013
12387
|
try {
|
|
12014
12388
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
12015
|
-
const
|
|
12016
|
-
if (
|
|
12389
|
+
const stat11 = await fs2.stat(resolved);
|
|
12390
|
+
if (stat11.isDirectory()) {
|
|
12017
12391
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
12018
12392
|
}
|
|
12019
|
-
const buffer = Buffer.alloc(Math.min(
|
|
12393
|
+
const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
|
|
12020
12394
|
const fd = await fs2.open(resolved, "r");
|
|
12021
12395
|
try {
|
|
12022
12396
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -12024,8 +12398,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
12024
12398
|
await fd.close();
|
|
12025
12399
|
}
|
|
12026
12400
|
const content = buffer.toString("utf-8");
|
|
12027
|
-
const truncated =
|
|
12028
|
-
return { content, truncated, size:
|
|
12401
|
+
const truncated = stat11.size > MAX_FILE_SIZE;
|
|
12402
|
+
return { content, truncated, size: stat11.size };
|
|
12029
12403
|
} catch (error) {
|
|
12030
12404
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
12031
12405
|
}
|
|
@@ -12076,8 +12450,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
12076
12450
|
const ext = path35.extname(entry.name).toLowerCase();
|
|
12077
12451
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
12078
12452
|
try {
|
|
12079
|
-
const
|
|
12080
|
-
if (
|
|
12453
|
+
const stat11 = await fs2.stat(fullPath);
|
|
12454
|
+
if (stat11.size > MAX_FILE_SIZE) continue;
|
|
12081
12455
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
12082
12456
|
const lines = content.split("\n");
|
|
12083
12457
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -12718,115 +13092,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
12718
13092
|
* Evaluate a single field against the expected value.
|
|
12719
13093
|
*/
|
|
12720
13094
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
12721
|
-
const { path:
|
|
12722
|
-
const candidateValue = resolvePath(candidateData,
|
|
12723
|
-
const expectedValue = resolvePath(expectedData,
|
|
13095
|
+
const { path: path50, match, required = true, weight = 1 } = fieldConfig;
|
|
13096
|
+
const candidateValue = resolvePath(candidateData, path50);
|
|
13097
|
+
const expectedValue = resolvePath(expectedData, path50);
|
|
12724
13098
|
if (expectedValue === void 0) {
|
|
12725
13099
|
return {
|
|
12726
|
-
path:
|
|
13100
|
+
path: path50,
|
|
12727
13101
|
score: 1,
|
|
12728
13102
|
// No expected value means no comparison needed
|
|
12729
13103
|
weight,
|
|
12730
13104
|
hit: true,
|
|
12731
|
-
message: `${
|
|
13105
|
+
message: `${path50}: no expected value`
|
|
12732
13106
|
};
|
|
12733
13107
|
}
|
|
12734
13108
|
if (candidateValue === void 0) {
|
|
12735
13109
|
if (required) {
|
|
12736
13110
|
return {
|
|
12737
|
-
path:
|
|
13111
|
+
path: path50,
|
|
12738
13112
|
score: 0,
|
|
12739
13113
|
weight,
|
|
12740
13114
|
hit: false,
|
|
12741
|
-
message: `${
|
|
13115
|
+
message: `${path50} (required, missing)`
|
|
12742
13116
|
};
|
|
12743
13117
|
}
|
|
12744
13118
|
return {
|
|
12745
|
-
path:
|
|
13119
|
+
path: path50,
|
|
12746
13120
|
score: 1,
|
|
12747
13121
|
// Don't penalize missing optional fields
|
|
12748
13122
|
weight: 0,
|
|
12749
13123
|
// Zero weight means it won't affect the score
|
|
12750
13124
|
hit: true,
|
|
12751
|
-
message: `${
|
|
13125
|
+
message: `${path50}: optional field missing`
|
|
12752
13126
|
};
|
|
12753
13127
|
}
|
|
12754
13128
|
switch (match) {
|
|
12755
13129
|
case "exact":
|
|
12756
|
-
return this.compareExact(
|
|
13130
|
+
return this.compareExact(path50, candidateValue, expectedValue, weight);
|
|
12757
13131
|
case "numeric_tolerance":
|
|
12758
13132
|
return this.compareNumericTolerance(
|
|
12759
|
-
|
|
13133
|
+
path50,
|
|
12760
13134
|
candidateValue,
|
|
12761
13135
|
expectedValue,
|
|
12762
13136
|
fieldConfig,
|
|
12763
13137
|
weight
|
|
12764
13138
|
);
|
|
12765
13139
|
case "date":
|
|
12766
|
-
return this.compareDate(
|
|
13140
|
+
return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
|
|
12767
13141
|
default:
|
|
12768
13142
|
return {
|
|
12769
|
-
path:
|
|
13143
|
+
path: path50,
|
|
12770
13144
|
score: 0,
|
|
12771
13145
|
weight,
|
|
12772
13146
|
hit: false,
|
|
12773
|
-
message: `${
|
|
13147
|
+
message: `${path50}: unknown match type "${match}"`
|
|
12774
13148
|
};
|
|
12775
13149
|
}
|
|
12776
13150
|
}
|
|
12777
13151
|
/**
|
|
12778
13152
|
* Exact equality comparison.
|
|
12779
13153
|
*/
|
|
12780
|
-
compareExact(
|
|
13154
|
+
compareExact(path50, candidateValue, expectedValue, weight) {
|
|
12781
13155
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
12782
13156
|
return {
|
|
12783
|
-
path:
|
|
13157
|
+
path: path50,
|
|
12784
13158
|
score: 1,
|
|
12785
13159
|
weight,
|
|
12786
13160
|
hit: true,
|
|
12787
|
-
message:
|
|
13161
|
+
message: path50
|
|
12788
13162
|
};
|
|
12789
13163
|
}
|
|
12790
13164
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
12791
13165
|
return {
|
|
12792
|
-
path:
|
|
13166
|
+
path: path50,
|
|
12793
13167
|
score: 0,
|
|
12794
13168
|
weight,
|
|
12795
13169
|
hit: false,
|
|
12796
|
-
message: `${
|
|
13170
|
+
message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
12797
13171
|
};
|
|
12798
13172
|
}
|
|
12799
13173
|
return {
|
|
12800
|
-
path:
|
|
13174
|
+
path: path50,
|
|
12801
13175
|
score: 0,
|
|
12802
13176
|
weight,
|
|
12803
13177
|
hit: false,
|
|
12804
|
-
message: `${
|
|
13178
|
+
message: `${path50} (value mismatch)`
|
|
12805
13179
|
};
|
|
12806
13180
|
}
|
|
12807
13181
|
/**
|
|
12808
13182
|
* Numeric comparison with absolute or relative tolerance.
|
|
12809
13183
|
*/
|
|
12810
|
-
compareNumericTolerance(
|
|
13184
|
+
compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12811
13185
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
12812
13186
|
const candidateNum = toNumber(candidateValue);
|
|
12813
13187
|
const expectedNum = toNumber(expectedValue);
|
|
12814
13188
|
if (candidateNum === null || expectedNum === null) {
|
|
12815
13189
|
return {
|
|
12816
|
-
path:
|
|
13190
|
+
path: path50,
|
|
12817
13191
|
score: 0,
|
|
12818
13192
|
weight,
|
|
12819
13193
|
hit: false,
|
|
12820
|
-
message: `${
|
|
13194
|
+
message: `${path50} (non-numeric value)`
|
|
12821
13195
|
};
|
|
12822
13196
|
}
|
|
12823
13197
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
12824
13198
|
return {
|
|
12825
|
-
path:
|
|
13199
|
+
path: path50,
|
|
12826
13200
|
score: 0,
|
|
12827
13201
|
weight,
|
|
12828
13202
|
hit: false,
|
|
12829
|
-
message: `${
|
|
13203
|
+
message: `${path50} (invalid numeric value)`
|
|
12830
13204
|
};
|
|
12831
13205
|
}
|
|
12832
13206
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -12839,61 +13213,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
12839
13213
|
}
|
|
12840
13214
|
if (withinTolerance) {
|
|
12841
13215
|
return {
|
|
12842
|
-
path:
|
|
13216
|
+
path: path50,
|
|
12843
13217
|
score: 1,
|
|
12844
13218
|
weight,
|
|
12845
13219
|
hit: true,
|
|
12846
|
-
message: `${
|
|
13220
|
+
message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
|
|
12847
13221
|
};
|
|
12848
13222
|
}
|
|
12849
13223
|
return {
|
|
12850
|
-
path:
|
|
13224
|
+
path: path50,
|
|
12851
13225
|
score: 0,
|
|
12852
13226
|
weight,
|
|
12853
13227
|
hit: false,
|
|
12854
|
-
message: `${
|
|
13228
|
+
message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
12855
13229
|
};
|
|
12856
13230
|
}
|
|
12857
13231
|
/**
|
|
12858
13232
|
* Date comparison with format normalization.
|
|
12859
13233
|
*/
|
|
12860
|
-
compareDate(
|
|
13234
|
+
compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12861
13235
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
12862
13236
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
12863
13237
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
12864
13238
|
if (candidateDate === null) {
|
|
12865
13239
|
return {
|
|
12866
|
-
path:
|
|
13240
|
+
path: path50,
|
|
12867
13241
|
score: 0,
|
|
12868
13242
|
weight,
|
|
12869
13243
|
hit: false,
|
|
12870
|
-
message: `${
|
|
13244
|
+
message: `${path50} (unparseable candidate date)`
|
|
12871
13245
|
};
|
|
12872
13246
|
}
|
|
12873
13247
|
if (expectedDate === null) {
|
|
12874
13248
|
return {
|
|
12875
|
-
path:
|
|
13249
|
+
path: path50,
|
|
12876
13250
|
score: 0,
|
|
12877
13251
|
weight,
|
|
12878
13252
|
hit: false,
|
|
12879
|
-
message: `${
|
|
13253
|
+
message: `${path50} (unparseable expected date)`
|
|
12880
13254
|
};
|
|
12881
13255
|
}
|
|
12882
13256
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
12883
13257
|
return {
|
|
12884
|
-
path:
|
|
13258
|
+
path: path50,
|
|
12885
13259
|
score: 1,
|
|
12886
13260
|
weight,
|
|
12887
13261
|
hit: true,
|
|
12888
|
-
message:
|
|
13262
|
+
message: path50
|
|
12889
13263
|
};
|
|
12890
13264
|
}
|
|
12891
13265
|
return {
|
|
12892
|
-
path:
|
|
13266
|
+
path: path50,
|
|
12893
13267
|
score: 0,
|
|
12894
13268
|
weight,
|
|
12895
13269
|
hit: false,
|
|
12896
|
-
message: `${
|
|
13270
|
+
message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
12897
13271
|
};
|
|
12898
13272
|
}
|
|
12899
13273
|
/**
|
|
@@ -12926,11 +13300,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
12926
13300
|
};
|
|
12927
13301
|
}
|
|
12928
13302
|
};
|
|
12929
|
-
function resolvePath(obj,
|
|
12930
|
-
if (!
|
|
13303
|
+
function resolvePath(obj, path50) {
|
|
13304
|
+
if (!path50 || !obj) {
|
|
12931
13305
|
return void 0;
|
|
12932
13306
|
}
|
|
12933
|
-
const parts =
|
|
13307
|
+
const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
12934
13308
|
let current = obj;
|
|
12935
13309
|
for (const part of parts) {
|
|
12936
13310
|
if (current === null || current === void 0) {
|
|
@@ -13422,8 +13796,8 @@ var TokenUsageEvaluator = class {
|
|
|
13422
13796
|
};
|
|
13423
13797
|
|
|
13424
13798
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
13425
|
-
function getNestedValue(obj,
|
|
13426
|
-
const parts =
|
|
13799
|
+
function getNestedValue(obj, path50) {
|
|
13800
|
+
const parts = path50.split(".");
|
|
13427
13801
|
let current = obj;
|
|
13428
13802
|
for (const part of parts) {
|
|
13429
13803
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -15195,7 +15569,7 @@ var WorkspacePoolManager = class {
|
|
|
15195
15569
|
}
|
|
15196
15570
|
/**
|
|
15197
15571
|
* Reset an existing slot for reuse:
|
|
15198
|
-
* 1. Reset repos (git reset --hard
|
|
15572
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
15199
15573
|
* 2. Re-copy template files (skip repo directories)
|
|
15200
15574
|
*/
|
|
15201
15575
|
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
@@ -15208,7 +15582,17 @@ var WorkspacePoolManager = class {
|
|
|
15208
15582
|
continue;
|
|
15209
15583
|
}
|
|
15210
15584
|
const ref = repo.checkout?.ref ?? "HEAD";
|
|
15211
|
-
|
|
15585
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
15586
|
+
if (resolve === "remote") {
|
|
15587
|
+
const fetchArgs = ["fetch", "origin", ref];
|
|
15588
|
+
if (repo.clone?.depth) {
|
|
15589
|
+
fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
|
|
15590
|
+
}
|
|
15591
|
+
await git(fetchArgs, { cwd: repoDir });
|
|
15592
|
+
await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
|
|
15593
|
+
} else {
|
|
15594
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
15595
|
+
}
|
|
15212
15596
|
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
15213
15597
|
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
15214
15598
|
}
|
|
@@ -15505,7 +15889,7 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
15505
15889
|
}
|
|
15506
15890
|
|
|
15507
15891
|
// src/evaluation/orchestrator.ts
|
|
15508
|
-
function classifyQualityStatus(score, threshold =
|
|
15892
|
+
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
|
|
15509
15893
|
return score >= threshold ? "ok" : "quality_failure";
|
|
15510
15894
|
}
|
|
15511
15895
|
function buildSkippedEvaluatorError(scores) {
|
|
@@ -15597,7 +15981,7 @@ async function runEvaluation(options) {
|
|
|
15597
15981
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
15598
15982
|
if (filteredEvalCases.length === 0) {
|
|
15599
15983
|
if (filter) {
|
|
15600
|
-
throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
|
|
15984
|
+
throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
|
|
15601
15985
|
}
|
|
15602
15986
|
return [];
|
|
15603
15987
|
}
|
|
@@ -15649,6 +16033,9 @@ async function runEvaluation(options) {
|
|
|
15649
16033
|
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
15650
16034
|
const resolvedGrader = resolveTargetByName(graderName);
|
|
15651
16035
|
if (!resolvedGrader) {
|
|
16036
|
+
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
16037
|
+
return void 0;
|
|
16038
|
+
}
|
|
15652
16039
|
return getOrCreateProvider(targetContext);
|
|
15653
16040
|
}
|
|
15654
16041
|
return getOrCreateProvider(resolvedGrader);
|
|
@@ -15979,7 +16366,7 @@ async function runEvaluation(options) {
|
|
|
15979
16366
|
const budgetResult = {
|
|
15980
16367
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
15981
16368
|
testId: evalCase.id,
|
|
15982
|
-
|
|
16369
|
+
suite: evalCase.suite,
|
|
15983
16370
|
category: evalCase.category,
|
|
15984
16371
|
score: 0,
|
|
15985
16372
|
assertions: [],
|
|
@@ -16016,7 +16403,7 @@ async function runEvaluation(options) {
|
|
|
16016
16403
|
const haltResult = {
|
|
16017
16404
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16018
16405
|
testId: evalCase.id,
|
|
16019
|
-
|
|
16406
|
+
suite: evalCase.suite,
|
|
16020
16407
|
category: evalCase.category,
|
|
16021
16408
|
score: 0,
|
|
16022
16409
|
assertions: [],
|
|
@@ -16328,7 +16715,7 @@ async function runBatchEvaluation(options) {
|
|
|
16328
16715
|
targetResolver,
|
|
16329
16716
|
availableTargets,
|
|
16330
16717
|
verbose,
|
|
16331
|
-
threshold: batchThreshold
|
|
16718
|
+
threshold: evalCase.threshold ?? batchThreshold
|
|
16332
16719
|
});
|
|
16333
16720
|
if (providerError) {
|
|
16334
16721
|
result = {
|
|
@@ -16790,8 +17177,9 @@ async function runEvalCase(options) {
|
|
|
16790
17177
|
fileChanges,
|
|
16791
17178
|
workspacePath,
|
|
16792
17179
|
verbose,
|
|
16793
|
-
threshold: caseThreshold
|
|
17180
|
+
threshold: evalCase.threshold ?? caseThreshold
|
|
16794
17181
|
});
|
|
17182
|
+
const effectiveThreshold = evalCase.threshold ?? caseThreshold;
|
|
16795
17183
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
16796
17184
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
16797
17185
|
const evalRunTokenUsage = tokenUsage || graderTokens ? {
|
|
@@ -16805,7 +17193,7 @@ async function runEvalCase(options) {
|
|
|
16805
17193
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
16806
17194
|
};
|
|
16807
17195
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
16808
|
-
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score,
|
|
17196
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
|
|
16809
17197
|
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
16810
17198
|
const finalResult = providerError ? {
|
|
16811
17199
|
...result,
|
|
@@ -17006,7 +17394,8 @@ async function evaluateCandidate(options) {
|
|
|
17006
17394
|
targetResolver,
|
|
17007
17395
|
availableTargets,
|
|
17008
17396
|
fileChanges,
|
|
17009
|
-
workspacePath
|
|
17397
|
+
workspacePath,
|
|
17398
|
+
threshold: evalThreshold
|
|
17010
17399
|
});
|
|
17011
17400
|
const completedAt = nowFn();
|
|
17012
17401
|
let agentRequest;
|
|
@@ -17037,7 +17426,7 @@ async function evaluateCandidate(options) {
|
|
|
17037
17426
|
return {
|
|
17038
17427
|
timestamp: completedAt.toISOString(),
|
|
17039
17428
|
testId: evalCase.id,
|
|
17040
|
-
|
|
17429
|
+
suite: evalCase.suite,
|
|
17041
17430
|
category: evalCase.category,
|
|
17042
17431
|
conversationId: evalCase.conversation_id,
|
|
17043
17432
|
score: score.score,
|
|
@@ -17080,7 +17469,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
17080
17469
|
targetResolver,
|
|
17081
17470
|
availableTargets,
|
|
17082
17471
|
fileChanges,
|
|
17083
|
-
workspacePath
|
|
17472
|
+
workspacePath,
|
|
17473
|
+
threshold
|
|
17084
17474
|
} = options;
|
|
17085
17475
|
if (evalCase.assertions && evalCase.assertions.length > 0) {
|
|
17086
17476
|
return runEvaluatorList({
|
|
@@ -17106,7 +17496,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
17106
17496
|
targetResolver,
|
|
17107
17497
|
availableTargets,
|
|
17108
17498
|
fileChanges,
|
|
17109
|
-
workspacePath
|
|
17499
|
+
workspacePath,
|
|
17500
|
+
threshold
|
|
17110
17501
|
});
|
|
17111
17502
|
}
|
|
17112
17503
|
const evaluatorKind = evalCase.evaluator ?? "llm-grader";
|
|
@@ -17208,7 +17599,8 @@ async function runEvaluatorList(options) {
|
|
|
17208
17599
|
name: evaluatorConfig.name,
|
|
17209
17600
|
type: evaluatorConfig.type,
|
|
17210
17601
|
weight,
|
|
17211
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17602
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
17603
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
17212
17604
|
});
|
|
17213
17605
|
scores.push({
|
|
17214
17606
|
name: evaluatorConfig.name,
|
|
@@ -17243,7 +17635,8 @@ async function runEvaluatorList(options) {
|
|
|
17243
17635
|
name: evaluatorConfig.name ?? "unknown",
|
|
17244
17636
|
type: evaluatorConfig.type ?? "llm-grader",
|
|
17245
17637
|
weight,
|
|
17246
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17638
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
17639
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
17247
17640
|
});
|
|
17248
17641
|
scores.push({
|
|
17249
17642
|
name: evaluatorConfig.name ?? "unknown",
|
|
@@ -17277,9 +17670,10 @@ async function runEvaluatorList(options) {
|
|
|
17277
17670
|
}
|
|
17278
17671
|
}
|
|
17279
17672
|
}
|
|
17673
|
+
const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
|
|
17280
17674
|
const hasRequiredFailure = scored.some((entry) => {
|
|
17281
17675
|
if (!entry.required) return false;
|
|
17282
|
-
const minScore = typeof entry.required === "number" ? entry.required :
|
|
17676
|
+
const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
|
|
17283
17677
|
return entry.score.score < minScore;
|
|
17284
17678
|
});
|
|
17285
17679
|
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
@@ -17290,17 +17684,23 @@ async function runEvaluatorList(options) {
|
|
|
17290
17684
|
const expectedAspectCount = assertions.length || 1;
|
|
17291
17685
|
const score = {
|
|
17292
17686
|
score: aggregateScore,
|
|
17293
|
-
verdict: scoreToVerdict(aggregateScore),
|
|
17687
|
+
verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
|
|
17294
17688
|
assertions,
|
|
17295
17689
|
expectedAspectCount
|
|
17296
17690
|
};
|
|
17297
17691
|
return { score, scores };
|
|
17298
17692
|
}
|
|
17693
|
+
function formatFilter(filter) {
|
|
17694
|
+
return typeof filter === "string" ? filter : filter.join(", ");
|
|
17695
|
+
}
|
|
17696
|
+
function matchesFilter3(id, filter) {
|
|
17697
|
+
return typeof filter === "string" ? micromatch3.isMatch(id, filter) : filter.some((pattern) => micromatch3.isMatch(id, pattern));
|
|
17698
|
+
}
|
|
17299
17699
|
function filterEvalCases(evalCases, filter) {
|
|
17300
17700
|
if (!filter) {
|
|
17301
17701
|
return evalCases;
|
|
17302
17702
|
}
|
|
17303
|
-
return evalCases.filter((evalCase) =>
|
|
17703
|
+
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
|
|
17304
17704
|
}
|
|
17305
17705
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
17306
17706
|
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
@@ -17387,7 +17787,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
17387
17787
|
return {
|
|
17388
17788
|
timestamp: timestamp.toISOString(),
|
|
17389
17789
|
testId: evalCase.id,
|
|
17390
|
-
|
|
17790
|
+
suite: evalCase.suite,
|
|
17391
17791
|
category: evalCase.category,
|
|
17392
17792
|
conversationId: evalCase.conversation_id,
|
|
17393
17793
|
score: 0,
|
|
@@ -17659,6 +18059,7 @@ async function evaluate(config) {
|
|
|
17659
18059
|
verbose: config.verbose,
|
|
17660
18060
|
maxConcurrency: config.workers ?? 3,
|
|
17661
18061
|
filter: config.filter,
|
|
18062
|
+
threshold: config.threshold,
|
|
17662
18063
|
evalCases,
|
|
17663
18064
|
onResult: async (result) => {
|
|
17664
18065
|
collectedResults.push(result);
|
|
@@ -17669,19 +18070,19 @@ async function evaluate(config) {
|
|
|
17669
18070
|
const durationMs = Date.now() - startTime;
|
|
17670
18071
|
return {
|
|
17671
18072
|
results: allResults,
|
|
17672
|
-
summary: computeSummary(allResults, durationMs)
|
|
18073
|
+
summary: computeSummary(allResults, durationMs, config.threshold)
|
|
17673
18074
|
};
|
|
17674
18075
|
}
|
|
17675
18076
|
function mapAssertionType(type) {
|
|
17676
18077
|
return type.replace(/_/g, "-");
|
|
17677
18078
|
}
|
|
17678
|
-
function computeSummary(results, durationMs) {
|
|
18079
|
+
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
17679
18080
|
const total = results.length;
|
|
17680
18081
|
let passed = 0;
|
|
17681
18082
|
let scoreSum = 0;
|
|
17682
18083
|
for (const r of results) {
|
|
17683
18084
|
scoreSum += r.score;
|
|
17684
|
-
if (r.score >=
|
|
18085
|
+
if (r.score >= threshold) {
|
|
17685
18086
|
passed++;
|
|
17686
18087
|
}
|
|
17687
18088
|
}
|
|
@@ -17794,7 +18195,7 @@ var CONFIG_FILE_NAMES = [
|
|
|
17794
18195
|
];
|
|
17795
18196
|
async function loadTsConfig(projectRoot) {
|
|
17796
18197
|
const { existsSync: existsSync7 } = await import("node:fs");
|
|
17797
|
-
const { pathToFileURL } = await import("node:url");
|
|
18198
|
+
const { pathToFileURL: pathToFileURL2 } = await import("node:url");
|
|
17798
18199
|
const { join: join2 } = await import("node:path");
|
|
17799
18200
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
17800
18201
|
const filePath = join2(projectRoot, fileName);
|
|
@@ -17802,7 +18203,7 @@ async function loadTsConfig(projectRoot) {
|
|
|
17802
18203
|
continue;
|
|
17803
18204
|
}
|
|
17804
18205
|
try {
|
|
17805
|
-
const fileUrl =
|
|
18206
|
+
const fileUrl = pathToFileURL2(filePath).href;
|
|
17806
18207
|
const mod = await import(fileUrl);
|
|
17807
18208
|
const config = mod.default ?? mod;
|
|
17808
18209
|
return AgentVConfigSchema.parse(config);
|
|
@@ -17936,7 +18337,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
17936
18337
|
}
|
|
17937
18338
|
|
|
17938
18339
|
// src/projects.ts
|
|
17939
|
-
import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
18340
|
+
import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
17940
18341
|
import path47 from "node:path";
|
|
17941
18342
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
17942
18343
|
function getProjectsRegistryPath() {
|
|
@@ -17962,7 +18363,7 @@ function saveProjectRegistry(registry) {
|
|
|
17962
18363
|
const registryPath = getProjectsRegistryPath();
|
|
17963
18364
|
const dir = path47.dirname(registryPath);
|
|
17964
18365
|
if (!existsSync6(dir)) {
|
|
17965
|
-
|
|
18366
|
+
mkdirSync2(dir, { recursive: true });
|
|
17966
18367
|
}
|
|
17967
18368
|
writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
|
|
17968
18369
|
}
|
|
@@ -18228,7 +18629,7 @@ var OtelTraceExporter = class {
|
|
|
18228
18629
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
18229
18630
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
18230
18631
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
18231
|
-
if (result.
|
|
18632
|
+
if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
|
|
18232
18633
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
18233
18634
|
if (captureContent && result.output.length > 0) {
|
|
18234
18635
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -18437,7 +18838,7 @@ var OtelStreamingObserver = class {
|
|
|
18437
18838
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
18438
18839
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
18439
18840
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
18440
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
18841
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
|
|
18441
18842
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
18442
18843
|
}
|
|
18443
18844
|
/** Create and immediately export a tool span */
|
|
@@ -18786,11 +19187,241 @@ function extractToolResultContent(content) {
|
|
|
18786
19187
|
return parts.length > 0 ? parts.join("") : void 0;
|
|
18787
19188
|
}
|
|
18788
19189
|
|
|
18789
|
-
// src/import/
|
|
19190
|
+
// src/import/codex-parser.ts
|
|
19191
|
+
/**
 * Parse a Codex rollout transcript (JSONL text) into a normalized session record.
 *
 * Every non-blank line is parsed as JSON. Lines that are not valid JSON, are not
 * objects, or carry no `type` are skipped (best-effort parsing). Recognized
 * entry types:
 * - `session_meta`: session id, cwd, CLI version and (first seen) model.
 * - `turn_context`: fills in model / cwd only when they are still unknown.
 * - `response_item`: user/assistant messages, tool calls and tool outputs.
 *
 * @param {string} jsonl - Raw JSONL contents of a rollout file.
 * @returns {{messages: object[], source: object, tokenUsage: undefined, durationMs: (number|undefined), costUsd: null}}
 *   `durationMs` is the span between the first and last entry timestamps, or
 *   `undefined` when timestamps are missing or cannot be parsed.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;
  // call_id -> location of the tool call that is still waiting for its output.
  const pendingCalls = new Map();

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      // Best-effort: a corrupt line must not abort the whole transcript.
      continue;
    }
    // `null` (and other non-object JSON values) are valid JSON but not entries;
    // reading `.type` on `null` would throw.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;

    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }

    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        appendCodexResponseItem(payload, messages, pendingCalls);
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps produce NaN; report "unknown" instead.
    if (Number.isFinite(elapsed)) durationMs = elapsed;
  }

  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}

/**
 * Fold one `response_item` payload into the message list.
 * Mutates `messages` and `pendingCalls`.
 */
function appendCodexResponseItem(payload, messages, pendingCalls) {
  const itemType = String(payload.type ?? "");
  switch (itemType) {
    case "message": {
      const role = String(payload.role ?? "");
      // Only user and assistant turns are kept; developer and other roles are dropped.
      if (role !== "user" && role !== "assistant") break;
      const content = extractResponseItemContent(payload.content);
      if (content) messages.push({ role, content });
      break;
    }
    case "function_call":
    case "custom_tool_call": {
      const callId = String(payload.call_id ?? "");
      const toolCall = {
        tool: String(payload.name ?? ""),
        input: decodeCodexToolInput(payload),
        id: callId
      };
      if (callId) {
        pendingCalls.set(callId, { msgIdx: messages.length, toolIdx: 0 });
      }
      messages.push({ role: "assistant", toolCalls: [toolCall] });
      break;
    }
    case "function_call_output":
    case "custom_tool_call_output": {
      const callId = String(payload.call_id ?? "");
      const pending = pendingCalls.get(callId);
      if (!pending) break;
      // Replace (rather than mutate) the message so earlier references stay untouched.
      const owner = messages[pending.msgIdx];
      const toolCalls = [...(owner.toolCalls ?? [])];
      toolCalls[pending.toolIdx] = { ...toolCalls[pending.toolIdx], output: payload.output };
      messages[pending.msgIdx] = { ...owner, toolCalls };
      pendingCalls.delete(callId);
      break;
    }
    // "reasoning" (thinking tokens) and unknown item types are skipped.
    default:
      break;
  }
}

/**
 * Resolve the input of a tool call payload.
 * String `arguments` are JSON-decoded when possible and kept raw otherwise.
 * When `arguments` is absent the payload's `input` field is used as-is
 * (NOTE(review): custom tool calls appear to carry free-form `input` instead of
 * `arguments` — confirm against the rollout format).
 */
function decodeCodexToolInput(payload) {
  const rawArguments = payload.arguments;
  if (rawArguments == null) {
    return payload.input ?? rawArguments;
  }
  if (typeof rawArguments !== "string") {
    return rawArguments;
  }
  try {
    return JSON.parse(rawArguments);
  } catch {
    return rawArguments;
  }
}
|
|
19340
|
+
/**
 * Flatten the `content` of a response item into plain text.
 *
 * @param {unknown} content - Either a string or an array of content blocks.
 * @returns {string|undefined} The string itself, or the concatenation of every
 *   block's string `text` field; `undefined` when nothing textual is present.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return void 0;
  const texts = content
    .filter((block) => typeof block === "object" && block !== null && typeof block.text === "string")
    .map((block) => block.text);
  return texts.length === 0 ? void 0 : texts.join("");
}
|
|
19354
|
+
|
|
19355
|
+
// src/import/codex-session-discovery.ts
|
|
18790
19356
|
import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
18791
19357
|
import { homedir as homedir3 } from "node:os";
|
|
18792
19358
|
import path48 from "node:path";
|
|
18793
|
-
var
|
|
19359
|
+
/**
 * Resolve the default directory holding Codex rollout sessions.
 *
 * Honors a non-empty `CODEX_HOME` environment variable (the Codex CLI's
 * documented override for its home directory) and otherwise falls back to
 * `~/.codex`. Evaluated lazily so the environment is read at call time.
 *
 * @returns {string} Path of the `sessions` directory.
 */
var DEFAULT_SESSIONS_DIR = () => {
  const codexHome = process.env.CODEX_HOME;
  const baseDir = typeof codexHome === "string" && codexHome.trim().length > 0 ? codexHome : path48.join(homedir3(), ".codex");
  return path48.join(baseDir, "sessions");
};
|
|
19360
|
+
/**
 * Discover Codex rollout files laid out as `<sessionsDir>/<year>/<month>/<day>/rollout-*.jsonl`.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] - Root to scan; defaults to DEFAULT_SESSIONS_DIR().
 * @param {string} [opts.date] - Only keep sessions whose `<year>-<month>-<day>` directory equals this value.
 * @param {boolean} [opts.latest] - When truthy, return only the most recently modified session.
 * @param {number} [opts.limit] - Maximum number of sessions to return (default 10).
 * @returns {Promise<Array<{sessionId: string, filePath: string, filename: string, updatedAt: Date}>>}
 *   Sessions sorted by modification time, newest first. Unreadable directories
 *   are skipped; a missing root yields an empty array.
 */
async function discoverCodexSessions(opts) {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
  const dateFilter = opts?.date;

  // A directory can only contribute matches if the requested date starts with
  // its `<year>-` / `<year>-<month>-` prefix, so non-matching subtrees are
  // skipped without touching the filesystem.
  const mayContainDate = (prefix) => !dateFilter || typeof dateFilter !== "string" || dateFilter.startsWith(prefix);

  // Best-effort listing: unreadable or non-directory entries count as empty.
  const listDir = async (dirPath) => {
    try {
      return await readdir8(dirPath);
    } catch {
      return [];
    }
  };

  const describeRollout = async (dayPath, file) => {
    const filePath = path48.join(dayPath, file);
    const nameWithoutExt = file.replace(/\.jsonl$/, "");
    const parts = nameWithoutExt.split("-");
    // The trailing five dash-separated segments form the session id
    // (rollout-<timestamp>-<8-4-4-4-12 id>); shorter names are used whole.
    const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
    let updatedAt;
    try {
      const fileStat = await stat9(filePath);
      updatedAt = fileStat.mtime;
    } catch {
      // Keep the session but sort it last when it cannot be stat'ed.
      updatedAt = new Date(0);
    }
    return { sessionId, filePath, filename: file, updatedAt };
  };

  const sessions = [];
  for (const year of await listDir(sessionsDir)) {
    if (!mayContainDate(`${year}-`)) continue;
    const yearPath = path48.join(sessionsDir, year);
    for (const month of await listDir(yearPath)) {
      if (!mayContainDate(`${year}-${month}-`)) continue;
      const monthPath = path48.join(yearPath, month);
      for (const day of await listDir(monthPath)) {
        if (dateFilter && `${year}-${month}-${day}` !== dateFilter) continue;
        const dayPath = path48.join(monthPath, day);
        const rollouts = (await listDir(dayPath)).filter(
          (file) => file.startsWith("rollout-") && file.endsWith(".jsonl")
        );
        // Files within one day are independent, so stat them concurrently;
        // Promise.all preserves listing order.
        const found = await Promise.all(rollouts.map((file) => describeRollout(dayPath, file)));
        for (const session of found) sessions.push(session);
      }
    }
  }

  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
|
|
19419
|
+
|
|
19420
|
+
// src/import/session-discovery.ts
|
|
19421
|
+
import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
|
|
19422
|
+
import { homedir as homedir4 } from "node:os";
|
|
19423
|
+
import path49 from "node:path";
|
|
19424
|
+
/**
 * Lazily compute the default projects directory: `<home>/.claude/projects`.
 *
 * @returns {string} Path under the current user's home directory.
 */
var DEFAULT_PROJECTS_DIR = () => {
  const claudeHome = path49.join(homedir4(), ".claude");
  return path49.join(claudeHome, "projects");
};
|
|
18794
19425
|
/**
 * Encode a project path as a directory name by replacing every "/" with "-".
 *
 * @param {string} projectPath - Path using forward slashes.
 * @returns {string} The path with each "/" turned into "-".
 */
function encodeProjectPath(projectPath) {
  const segments = projectPath.split("/");
  return segments.join("-");
}
|
|
@@ -18799,7 +19430,7 @@ async function discoverClaudeSessions(opts) {
|
|
|
18799
19430
|
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
18800
19431
|
let projectDirs;
|
|
18801
19432
|
try {
|
|
18802
|
-
projectDirs = await
|
|
19433
|
+
projectDirs = await readdir9(projectsDir);
|
|
18803
19434
|
} catch {
|
|
18804
19435
|
return [];
|
|
18805
19436
|
}
|
|
@@ -18809,10 +19440,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
18809
19440
|
}
|
|
18810
19441
|
const sessions = [];
|
|
18811
19442
|
for (const projectDir of projectDirs) {
|
|
18812
|
-
const dirPath =
|
|
19443
|
+
const dirPath = path49.join(projectsDir, projectDir);
|
|
18813
19444
|
let entries;
|
|
18814
19445
|
try {
|
|
18815
|
-
entries = await
|
|
19446
|
+
entries = await readdir9(dirPath);
|
|
18816
19447
|
} catch {
|
|
18817
19448
|
continue;
|
|
18818
19449
|
}
|
|
@@ -18820,10 +19451,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
18820
19451
|
if (!entry.endsWith(".jsonl")) continue;
|
|
18821
19452
|
const sessionId = entry.replace(/\.jsonl$/, "");
|
|
18822
19453
|
if (opts?.sessionId && sessionId !== opts.sessionId) continue;
|
|
18823
|
-
const filePath =
|
|
19454
|
+
const filePath = path49.join(dirPath, entry);
|
|
18824
19455
|
let updatedAt;
|
|
18825
19456
|
try {
|
|
18826
|
-
const fileStat = await
|
|
19457
|
+
const fileStat = await stat10(filePath);
|
|
18827
19458
|
updatedAt = fileStat.mtime;
|
|
18828
19459
|
} catch {
|
|
18829
19460
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -18842,10 +19473,85 @@ async function discoverClaudeSessions(opts) {
|
|
|
18842
19473
|
|
|
18843
19474
|
// src/import/types.ts
|
|
18844
19475
|
import { readFile as readFile14 } from "node:fs/promises";
|
|
19476
|
+
/**
 * Convert a parsed transcript entry into its snake_case JSONL line shape.
 *
 * `input` is the content of the first user message when that content is a
 * string, otherwise the empty string. `output` is the full message list.
 *
 * @param {object} entry - Parsed session with `messages`, `source`, and optional
 *   `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} Plain object ready to be serialized as one JSONL line.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage, source } = entry;

  const firstUserContent = messages.find(({ role }) => role === "user")?.content;
  let input = "";
  if (typeof firstUserContent === "string") {
    input = firstUserContent;
  }

  let token_usage = void 0;
  if (tokenUsage) {
    const { input: inputTokens, output: outputTokens, cached } = tokenUsage;
    token_usage = { input: inputTokens, output: outputTokens, cached };
  }

  const sourceInfo = {
    provider: source.provider,
    session_id: source.sessionId,
    model: source.model,
    timestamp: source.startedAt,
    git_branch: source.gitBranch,
    cwd: source.cwd ?? source.projectPath,
    version: source.version
  };

  return {
    input,
    output: messages,
    token_usage,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: sourceInfo
  };
}
|
|
19500
|
+
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * @param {string} filePath - Path of the JSONL file (read as UTF-8).
 * @returns {Promise<unknown[]>} One parsed value per non-blank line, in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message names the
 *   file and 1-based line number and the original error is attached as `cause`.
 * @throws Any error raised by the underlying file read.
 */
async function readTranscriptJsonl(filePath) {
  const text = await readFile14(filePath, "utf8");
  // A UTF-8 byte order mark survives decoding and would make the first line unparseable.
  const body = text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
  const records = [];
  for (const [index, rawLine] of body.split("\n").entries()) {
    if (rawLine.trim().length === 0) continue;
    try {
      records.push(JSON.parse(rawLine));
    } catch (err) {
      // Keep the SyntaxError type callers may already handle, but say where it happened.
      throw new SyntaxError(
        `Invalid JSON on line ${index + 1} of transcript file ${filePath}: ${err.message}`,
        { cause: err }
      );
    }
  }
  return records;
}
|
|
18845
19504
|
/**
 * Read a transcript file as UTF-8 text.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file's full contents.
 */
async function readTranscriptFile(filePath) {
  const contents = await readFile14(filePath, "utf8");
  return contents;
}
|
|
18848
19507
|
|
|
19508
|
+
// src/import/transcript-provider.ts
|
|
19509
|
+
/**
 * Provider that replays pre-recorded transcript lines instead of calling a
 * live target. Each call to `invoke` consumes the next line, in order.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  // Index of the next transcript line to hand out.
  cursor = 0;
  /**
   * @param {string} targetName - Name used for `targetName` and the `transcript:<name>` id.
   * @param {object[]} lines - Parsed transcript lines (snake_case JSONL records).
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   * The target name is the first line's `source.provider`, or "transcript"
   * when the first line has no source/provider.
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<_TranscriptProvider>}
   * @throws {Error} When the file contains no transcript lines.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // Optional chaining so a record without `source` reaches the fallback
    // instead of throwing a TypeError.
    const providerName = lines[0]?.source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript lines available for replay. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next recorded response; the request itself is ignored.
   * @param {unknown} _request - Unused.
   * @returns {Promise<{output: unknown, tokenUsage: (object|undefined), durationMs: (number|undefined), costUsd: (number|undefined), startTime: (string|undefined)}>}
   * @throws {Error} When more invocations are attempted than lines exist.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // Recorded `null` cost is normalized to "unknown".
      costUsd: line.cost_usd ?? void 0,
      // A record without `source` simply has no start time.
      startTime: line.source?.timestamp
    };
  }
};
|
|
19554
|
+
|
|
18849
19555
|
// src/index.ts
|
|
18850
19556
|
function createAgentKernel() {
|
|
18851
19557
|
return { status: "stub" };
|
|
@@ -18859,6 +19565,7 @@ export {
|
|
|
18859
19565
|
DEFAULT_EVALUATOR_TEMPLATE,
|
|
18860
19566
|
DEFAULT_EVAL_PATTERNS,
|
|
18861
19567
|
DEFAULT_EXPLORATION_TOOLS,
|
|
19568
|
+
DEFAULT_THRESHOLD,
|
|
18862
19569
|
DeterministicAssertionEvaluator,
|
|
18863
19570
|
EvaluatorRegistry,
|
|
18864
19571
|
ExecutionMetricsEvaluator,
|
|
@@ -18880,6 +19587,7 @@ export {
|
|
|
18880
19587
|
TemplateNotFoundError,
|
|
18881
19588
|
TokenUsageEvaluator,
|
|
18882
19589
|
ToolTrajectoryEvaluator,
|
|
19590
|
+
TranscriptProvider,
|
|
18883
19591
|
WorkspaceCreationError,
|
|
18884
19592
|
WorkspacePoolManager,
|
|
18885
19593
|
addProject,
|
|
@@ -18916,6 +19624,7 @@ export {
|
|
|
18916
19624
|
detectFormat,
|
|
18917
19625
|
discoverAssertions,
|
|
18918
19626
|
discoverClaudeSessions,
|
|
19627
|
+
discoverCodexSessions,
|
|
18919
19628
|
discoverCopilotSessions,
|
|
18920
19629
|
discoverGraders,
|
|
18921
19630
|
discoverGraders as discoverJudges,
|
|
@@ -18976,6 +19685,8 @@ export {
|
|
|
18976
19685
|
normalizeLineEndings,
|
|
18977
19686
|
parseAgentSkillsEvals,
|
|
18978
19687
|
parseClaudeSession,
|
|
19688
|
+
parseCodexSession,
|
|
19689
|
+
parseCopilotEvents,
|
|
18979
19690
|
parseJsonFromText,
|
|
18980
19691
|
parseJsonSafe,
|
|
18981
19692
|
readJsonFile,
|
|
@@ -18983,6 +19694,7 @@ export {
|
|
|
18983
19694
|
readTestSuiteMetadata,
|
|
18984
19695
|
readTextFile,
|
|
18985
19696
|
readTranscriptFile,
|
|
19697
|
+
readTranscriptJsonl,
|
|
18986
19698
|
removeProject,
|
|
18987
19699
|
resolveAndCreateProvider,
|
|
18988
19700
|
resolveDelegatedTargetDefinition,
|
|
@@ -19015,6 +19727,7 @@ export {
|
|
|
19015
19727
|
substituteVariables,
|
|
19016
19728
|
toCamelCaseDeep,
|
|
19017
19729
|
toSnakeCaseDeep,
|
|
19730
|
+
toTranscriptJsonLine,
|
|
19018
19731
|
tokensPerTool,
|
|
19019
19732
|
touchProject,
|
|
19020
19733
|
transpileEvalYaml,
|