@agentv/core 4.6.1 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-ZK4GG7PR.js → chunk-VCVVKCC4.js} +268 -128
- package/dist/chunk-VCVVKCC4.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +110 -94
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +30 -71
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1353 -466
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +301 -74
- package/dist/index.d.ts +301 -74
- package/dist/index.js +1053 -311
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-ZK4GG7PR.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
COMMON_TARGET_SETTINGS,
|
|
3
|
+
LLM_GRADER_CAPABLE_KINDS,
|
|
3
4
|
TEST_MESSAGE_ROLES,
|
|
4
5
|
buildDirectoryChain,
|
|
5
6
|
buildSearchRoots,
|
|
@@ -24,7 +25,7 @@ import {
|
|
|
24
25
|
resolveDelegatedTargetDefinition,
|
|
25
26
|
resolveFileReference,
|
|
26
27
|
resolveTargetDefinition
|
|
27
|
-
} from "./chunk-
|
|
28
|
+
} from "./chunk-VCVVKCC4.js";
|
|
28
29
|
import {
|
|
29
30
|
AgentvProvider
|
|
30
31
|
} from "./chunk-PRNXHNLF.js";
|
|
@@ -856,8 +857,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
856
857
|
const negate = rawEvaluator.negate === true ? true : void 0;
|
|
857
858
|
if (isCustomType) {
|
|
858
859
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
859
|
-
const required2 =
|
|
860
|
-
|
|
860
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
861
|
+
rawEvaluator.required,
|
|
862
|
+
rawEvaluator.min_score,
|
|
863
|
+
name,
|
|
864
|
+
evalId
|
|
865
|
+
);
|
|
866
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
|
|
861
867
|
const config2 = {};
|
|
862
868
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
863
869
|
if (!knownProps2.has(key) && value !== void 0) {
|
|
@@ -869,6 +875,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
869
875
|
type: customTypeName,
|
|
870
876
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
871
877
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
878
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
872
879
|
...negate !== void 0 ? { negate } : {},
|
|
873
880
|
...Object.keys(config2).length > 0 ? { config: config2 } : {}
|
|
874
881
|
});
|
|
@@ -938,7 +945,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
938
945
|
);
|
|
939
946
|
}
|
|
940
947
|
}
|
|
941
|
-
const required2 =
|
|
948
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
949
|
+
rawEvaluator.required,
|
|
950
|
+
rawEvaluator.min_score,
|
|
951
|
+
name,
|
|
952
|
+
evalId
|
|
953
|
+
);
|
|
942
954
|
const knownProps2 = /* @__PURE__ */ new Set([
|
|
943
955
|
"name",
|
|
944
956
|
"type",
|
|
@@ -964,6 +976,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
964
976
|
resolvedCwd,
|
|
965
977
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
966
978
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
979
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
967
980
|
...negate !== void 0 ? { negate } : {},
|
|
968
981
|
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
969
982
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
@@ -1092,7 +1105,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1092
1105
|
};
|
|
1093
1106
|
}
|
|
1094
1107
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1095
|
-
const required2 =
|
|
1108
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1109
|
+
rawEvaluator.required,
|
|
1110
|
+
rawEvaluator.min_score,
|
|
1111
|
+
name,
|
|
1112
|
+
evalId
|
|
1113
|
+
);
|
|
1096
1114
|
evaluators.push({
|
|
1097
1115
|
name,
|
|
1098
1116
|
type: "composite",
|
|
@@ -1100,6 +1118,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1100
1118
|
aggregator,
|
|
1101
1119
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1102
1120
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1121
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1103
1122
|
...negate !== void 0 ? { negate } : {}
|
|
1104
1123
|
});
|
|
1105
1124
|
continue;
|
|
@@ -1210,7 +1229,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1210
1229
|
continue;
|
|
1211
1230
|
}
|
|
1212
1231
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1213
|
-
const required2 =
|
|
1232
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1233
|
+
rawEvaluator.required,
|
|
1234
|
+
rawEvaluator.min_score,
|
|
1235
|
+
name,
|
|
1236
|
+
evalId
|
|
1237
|
+
);
|
|
1214
1238
|
const config2 = {
|
|
1215
1239
|
name,
|
|
1216
1240
|
type: "tool-trajectory",
|
|
@@ -1219,6 +1243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1219
1243
|
...expected ? { expected } : {},
|
|
1220
1244
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1221
1245
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1246
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1222
1247
|
...negate !== void 0 ? { negate } : {},
|
|
1223
1248
|
...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
|
|
1224
1249
|
};
|
|
@@ -1281,7 +1306,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1281
1306
|
const aggregation = asString(rawEvaluator.aggregation);
|
|
1282
1307
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
1283
1308
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1284
|
-
const required2 =
|
|
1309
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1310
|
+
rawEvaluator.required,
|
|
1311
|
+
rawEvaluator.min_score,
|
|
1312
|
+
name,
|
|
1313
|
+
evalId
|
|
1314
|
+
);
|
|
1285
1315
|
evaluators.push({
|
|
1286
1316
|
name,
|
|
1287
1317
|
type: "field-accuracy",
|
|
@@ -1289,6 +1319,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1289
1319
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
1290
1320
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1291
1321
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1322
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1292
1323
|
...negate !== void 0 ? { negate } : {}
|
|
1293
1324
|
});
|
|
1294
1325
|
continue;
|
|
@@ -1302,13 +1333,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1302
1333
|
continue;
|
|
1303
1334
|
}
|
|
1304
1335
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1305
|
-
const required2 =
|
|
1336
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1337
|
+
rawEvaluator.required,
|
|
1338
|
+
rawEvaluator.min_score,
|
|
1339
|
+
name,
|
|
1340
|
+
evalId
|
|
1341
|
+
);
|
|
1306
1342
|
evaluators.push({
|
|
1307
1343
|
name,
|
|
1308
1344
|
type: "latency",
|
|
1309
1345
|
threshold,
|
|
1310
1346
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1311
1347
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1348
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1312
1349
|
...negate !== void 0 ? { negate } : {}
|
|
1313
1350
|
});
|
|
1314
1351
|
continue;
|
|
@@ -1322,13 +1359,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1322
1359
|
continue;
|
|
1323
1360
|
}
|
|
1324
1361
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1325
|
-
const required2 =
|
|
1362
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1363
|
+
rawEvaluator.required,
|
|
1364
|
+
rawEvaluator.min_score,
|
|
1365
|
+
name,
|
|
1366
|
+
evalId
|
|
1367
|
+
);
|
|
1326
1368
|
evaluators.push({
|
|
1327
1369
|
name,
|
|
1328
1370
|
type: "cost",
|
|
1329
1371
|
budget,
|
|
1330
1372
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1331
1373
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1374
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1332
1375
|
...negate !== void 0 ? { negate } : {}
|
|
1333
1376
|
});
|
|
1334
1377
|
continue;
|
|
@@ -1360,13 +1403,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1360
1403
|
continue;
|
|
1361
1404
|
}
|
|
1362
1405
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1363
|
-
const required2 =
|
|
1406
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1407
|
+
rawEvaluator.required,
|
|
1408
|
+
rawEvaluator.min_score,
|
|
1409
|
+
name,
|
|
1410
|
+
evalId
|
|
1411
|
+
);
|
|
1364
1412
|
evaluators.push({
|
|
1365
1413
|
name,
|
|
1366
1414
|
type: "token-usage",
|
|
1367
1415
|
...validLimits,
|
|
1368
1416
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1369
1417
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1418
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1370
1419
|
...negate !== void 0 ? { negate } : {}
|
|
1371
1420
|
});
|
|
1372
1421
|
continue;
|
|
@@ -1412,13 +1461,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1412
1461
|
continue;
|
|
1413
1462
|
}
|
|
1414
1463
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1415
|
-
const required2 =
|
|
1464
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1465
|
+
rawEvaluator.required,
|
|
1466
|
+
rawEvaluator.min_score,
|
|
1467
|
+
name,
|
|
1468
|
+
evalId
|
|
1469
|
+
);
|
|
1416
1470
|
evaluators.push({
|
|
1417
1471
|
name,
|
|
1418
1472
|
type: "execution-metrics",
|
|
1419
1473
|
...validThresholds,
|
|
1420
1474
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1421
1475
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1476
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1422
1477
|
...negate !== void 0 ? { negate } : {}
|
|
1423
1478
|
});
|
|
1424
1479
|
continue;
|
|
@@ -1432,7 +1487,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1432
1487
|
const rawShouldTrigger = rawEvaluator.should_trigger;
|
|
1433
1488
|
const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
|
|
1434
1489
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1435
|
-
const required2 =
|
|
1490
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1491
|
+
rawEvaluator.required,
|
|
1492
|
+
rawEvaluator.min_score,
|
|
1493
|
+
name,
|
|
1494
|
+
evalId
|
|
1495
|
+
);
|
|
1436
1496
|
evaluators.push({
|
|
1437
1497
|
name,
|
|
1438
1498
|
type: "skill-trigger",
|
|
@@ -1440,6 +1500,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1440
1500
|
...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
|
|
1441
1501
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1442
1502
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1503
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1443
1504
|
...negate !== void 0 ? { negate } : {}
|
|
1444
1505
|
});
|
|
1445
1506
|
continue;
|
|
@@ -1451,13 +1512,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1451
1512
|
continue;
|
|
1452
1513
|
}
|
|
1453
1514
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1454
|
-
const required2 =
|
|
1515
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1516
|
+
rawEvaluator.required,
|
|
1517
|
+
rawEvaluator.min_score,
|
|
1518
|
+
name,
|
|
1519
|
+
evalId
|
|
1520
|
+
);
|
|
1455
1521
|
evaluators.push({
|
|
1456
1522
|
name,
|
|
1457
1523
|
type: "contains",
|
|
1458
1524
|
value,
|
|
1459
1525
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1460
1526
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1527
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1461
1528
|
...negate !== void 0 ? { negate } : {}
|
|
1462
1529
|
});
|
|
1463
1530
|
continue;
|
|
@@ -1471,13 +1538,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1471
1538
|
continue;
|
|
1472
1539
|
}
|
|
1473
1540
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1474
|
-
const required2 =
|
|
1541
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1542
|
+
rawEvaluator.required,
|
|
1543
|
+
rawEvaluator.min_score,
|
|
1544
|
+
name,
|
|
1545
|
+
evalId
|
|
1546
|
+
);
|
|
1475
1547
|
evaluators.push({
|
|
1476
1548
|
name,
|
|
1477
1549
|
type: typeValue,
|
|
1478
1550
|
value,
|
|
1479
1551
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1480
1552
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1553
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1481
1554
|
...negate !== void 0 ? { negate } : {}
|
|
1482
1555
|
});
|
|
1483
1556
|
continue;
|
|
@@ -1489,13 +1562,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1489
1562
|
continue;
|
|
1490
1563
|
}
|
|
1491
1564
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1492
|
-
const required2 =
|
|
1565
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1566
|
+
rawEvaluator.required,
|
|
1567
|
+
rawEvaluator.min_score,
|
|
1568
|
+
name,
|
|
1569
|
+
evalId
|
|
1570
|
+
);
|
|
1493
1571
|
evaluators.push({
|
|
1494
1572
|
name,
|
|
1495
1573
|
type: "icontains",
|
|
1496
1574
|
value,
|
|
1497
1575
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1498
1576
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1577
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1499
1578
|
...negate !== void 0 ? { negate } : {}
|
|
1500
1579
|
});
|
|
1501
1580
|
continue;
|
|
@@ -1509,13 +1588,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1509
1588
|
continue;
|
|
1510
1589
|
}
|
|
1511
1590
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1512
|
-
const required2 =
|
|
1591
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1592
|
+
rawEvaluator.required,
|
|
1593
|
+
rawEvaluator.min_score,
|
|
1594
|
+
name,
|
|
1595
|
+
evalId
|
|
1596
|
+
);
|
|
1513
1597
|
evaluators.push({
|
|
1514
1598
|
name,
|
|
1515
1599
|
type: typeValue,
|
|
1516
1600
|
value,
|
|
1517
1601
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1518
1602
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1603
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1519
1604
|
...negate !== void 0 ? { negate } : {}
|
|
1520
1605
|
});
|
|
1521
1606
|
continue;
|
|
@@ -1527,13 +1612,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1527
1612
|
continue;
|
|
1528
1613
|
}
|
|
1529
1614
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1530
|
-
const required2 =
|
|
1615
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1616
|
+
rawEvaluator.required,
|
|
1617
|
+
rawEvaluator.min_score,
|
|
1618
|
+
name,
|
|
1619
|
+
evalId
|
|
1620
|
+
);
|
|
1531
1621
|
evaluators.push({
|
|
1532
1622
|
name,
|
|
1533
1623
|
type: typeValue,
|
|
1534
1624
|
value,
|
|
1535
1625
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1536
1626
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1627
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1537
1628
|
...negate !== void 0 ? { negate } : {}
|
|
1538
1629
|
});
|
|
1539
1630
|
continue;
|
|
@@ -1546,7 +1637,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1546
1637
|
}
|
|
1547
1638
|
const flags = asString(rawEvaluator.flags);
|
|
1548
1639
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1549
|
-
const required2 =
|
|
1640
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1641
|
+
rawEvaluator.required,
|
|
1642
|
+
rawEvaluator.min_score,
|
|
1643
|
+
name,
|
|
1644
|
+
evalId
|
|
1645
|
+
);
|
|
1550
1646
|
evaluators.push({
|
|
1551
1647
|
name,
|
|
1552
1648
|
type: "regex",
|
|
@@ -1554,18 +1650,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1554
1650
|
...flags !== void 0 ? { flags } : {},
|
|
1555
1651
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1556
1652
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1653
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1557
1654
|
...negate !== void 0 ? { negate } : {}
|
|
1558
1655
|
});
|
|
1559
1656
|
continue;
|
|
1560
1657
|
}
|
|
1561
1658
|
if (typeValue === "is-json") {
|
|
1562
1659
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1563
|
-
const required2 =
|
|
1660
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1661
|
+
rawEvaluator.required,
|
|
1662
|
+
rawEvaluator.min_score,
|
|
1663
|
+
name,
|
|
1664
|
+
evalId
|
|
1665
|
+
);
|
|
1564
1666
|
evaluators.push({
|
|
1565
1667
|
name,
|
|
1566
1668
|
type: "is-json",
|
|
1567
1669
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1568
1670
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1671
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1569
1672
|
...negate !== void 0 ? { negate } : {}
|
|
1570
1673
|
});
|
|
1571
1674
|
continue;
|
|
@@ -1577,13 +1680,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1577
1680
|
continue;
|
|
1578
1681
|
}
|
|
1579
1682
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1580
|
-
const required2 =
|
|
1683
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1684
|
+
rawEvaluator.required,
|
|
1685
|
+
rawEvaluator.min_score,
|
|
1686
|
+
name,
|
|
1687
|
+
evalId
|
|
1688
|
+
);
|
|
1581
1689
|
evaluators.push({
|
|
1582
1690
|
name,
|
|
1583
1691
|
type: "equals",
|
|
1584
1692
|
value,
|
|
1585
1693
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1586
1694
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1695
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1587
1696
|
...negate !== void 0 ? { negate } : {}
|
|
1588
1697
|
});
|
|
1589
1698
|
continue;
|
|
@@ -1619,7 +1728,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1619
1728
|
continue;
|
|
1620
1729
|
}
|
|
1621
1730
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1622
|
-
const required2 =
|
|
1731
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1732
|
+
rawEvaluator.required,
|
|
1733
|
+
rawEvaluator.min_score,
|
|
1734
|
+
name,
|
|
1735
|
+
evalId
|
|
1736
|
+
);
|
|
1623
1737
|
evaluators.push({
|
|
1624
1738
|
name,
|
|
1625
1739
|
type: "llm-grader",
|
|
@@ -1627,6 +1741,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1627
1741
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1628
1742
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1629
1743
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1744
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1630
1745
|
...negate !== void 0 ? { negate } : {}
|
|
1631
1746
|
});
|
|
1632
1747
|
continue;
|
|
@@ -1696,7 +1811,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1696
1811
|
continue;
|
|
1697
1812
|
}
|
|
1698
1813
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1699
|
-
const required2 =
|
|
1814
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1815
|
+
rawEvaluator.required,
|
|
1816
|
+
rawEvaluator.min_score,
|
|
1817
|
+
name,
|
|
1818
|
+
evalId
|
|
1819
|
+
);
|
|
1700
1820
|
evaluators.push({
|
|
1701
1821
|
name,
|
|
1702
1822
|
type: "llm-grader",
|
|
@@ -1704,12 +1824,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1704
1824
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1705
1825
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1706
1826
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1827
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1707
1828
|
...negate !== void 0 ? { negate } : {}
|
|
1708
1829
|
});
|
|
1709
1830
|
continue;
|
|
1710
1831
|
}
|
|
1711
1832
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1712
|
-
const required =
|
|
1833
|
+
const { required, min_score } = parseRequiredAndMinScore(
|
|
1834
|
+
rawEvaluator.required,
|
|
1835
|
+
rawEvaluator.min_score,
|
|
1836
|
+
name,
|
|
1837
|
+
evalId
|
|
1838
|
+
);
|
|
1713
1839
|
const knownProps = /* @__PURE__ */ new Set([
|
|
1714
1840
|
"name",
|
|
1715
1841
|
"type",
|
|
@@ -1720,6 +1846,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1720
1846
|
"weight",
|
|
1721
1847
|
"config",
|
|
1722
1848
|
"required",
|
|
1849
|
+
"min_score",
|
|
1723
1850
|
"negate",
|
|
1724
1851
|
"max_steps",
|
|
1725
1852
|
"maxSteps",
|
|
@@ -1749,6 +1876,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1749
1876
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1750
1877
|
...weight !== void 0 ? { weight } : {},
|
|
1751
1878
|
...required !== void 0 ? { required } : {},
|
|
1879
|
+
...min_score !== void 0 ? { min_score } : {},
|
|
1752
1880
|
...negate !== void 0 ? { negate } : {},
|
|
1753
1881
|
...finalConfig ? { config: finalConfig } : {},
|
|
1754
1882
|
...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
|
|
@@ -1880,10 +2008,23 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
1880
2008
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
|
|
1881
2009
|
}
|
|
1882
2010
|
}
|
|
1883
|
-
function
|
|
1884
|
-
|
|
1885
|
-
if (typeof
|
|
1886
|
-
|
|
2011
|
+
function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
|
|
2012
|
+
const result = {};
|
|
2013
|
+
if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
|
|
2014
|
+
result.min_score = rawMinScore;
|
|
2015
|
+
}
|
|
2016
|
+
if (rawRequired === true) {
|
|
2017
|
+
result.required = true;
|
|
2018
|
+
} else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
|
|
2019
|
+
if (result.min_score === void 0) {
|
|
2020
|
+
result.min_score = rawRequired;
|
|
2021
|
+
}
|
|
2022
|
+
result.required = rawRequired;
|
|
2023
|
+
logWarning2(
|
|
2024
|
+
`Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
2025
|
+
);
|
|
2026
|
+
}
|
|
2027
|
+
return result;
|
|
1887
2028
|
}
|
|
1888
2029
|
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
1889
2030
|
if (rawWeight === void 0) {
|
|
@@ -1926,16 +2067,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1926
2067
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
1927
2068
|
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
1928
2069
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
2070
|
+
let minScore;
|
|
1929
2071
|
let requiredMinScore;
|
|
1930
2072
|
let required;
|
|
1931
|
-
if (typeof rawRubric.
|
|
1932
|
-
const
|
|
1933
|
-
if (
|
|
2073
|
+
if (typeof rawRubric.min_score === "number") {
|
|
2074
|
+
const ms = rawRubric.min_score;
|
|
2075
|
+
if (ms <= 0 || ms > 1) {
|
|
1934
2076
|
throw new Error(
|
|
1935
|
-
`Invalid
|
|
2077
|
+
`Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
|
|
1936
2078
|
);
|
|
1937
2079
|
}
|
|
1938
|
-
|
|
2080
|
+
minScore = ms;
|
|
2081
|
+
requiredMinScore = Math.round(ms * 10);
|
|
2082
|
+
} else if (typeof rawRubric.required_min_score === "number") {
|
|
2083
|
+
const rms = rawRubric.required_min_score;
|
|
2084
|
+
if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
|
|
2085
|
+
throw new Error(
|
|
2086
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
|
|
2087
|
+
);
|
|
2088
|
+
}
|
|
2089
|
+
requiredMinScore = rms;
|
|
2090
|
+
minScore = rms / 10;
|
|
2091
|
+
logWarning2(
|
|
2092
|
+
`Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
|
|
2093
|
+
);
|
|
1939
2094
|
}
|
|
1940
2095
|
if (typeof rawRubric.required === "boolean") {
|
|
1941
2096
|
required = rawRubric.required;
|
|
@@ -1955,6 +2110,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1955
2110
|
weight,
|
|
1956
2111
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
1957
2112
|
...required !== void 0 ? { required } : {},
|
|
2113
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
1958
2114
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
1959
2115
|
score_ranges: scoreRanges
|
|
1960
2116
|
});
|
|
@@ -1971,6 +2127,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1971
2127
|
weight,
|
|
1972
2128
|
// Default to required: true if not specified (backward compatibility)
|
|
1973
2129
|
required: required ?? true,
|
|
2130
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
1974
2131
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
1975
2132
|
});
|
|
1976
2133
|
}
|
|
@@ -2099,12 +2256,22 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
2099
2256
|
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
2100
2257
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
2101
2258
|
};
|
|
2259
|
+
let inlineMinScore;
|
|
2260
|
+
let inlineRequiredMinScore;
|
|
2261
|
+
if (typeof rubric.min_score === "number") {
|
|
2262
|
+
inlineMinScore = rubric.min_score;
|
|
2263
|
+
inlineRequiredMinScore = Math.round(inlineMinScore * 10);
|
|
2264
|
+
} else if (typeof rubric.required_min_score === "number") {
|
|
2265
|
+
inlineRequiredMinScore = rubric.required_min_score;
|
|
2266
|
+
inlineMinScore = inlineRequiredMinScore / 10;
|
|
2267
|
+
}
|
|
2102
2268
|
if (scoreRanges && scoreRanges.length > 0) {
|
|
2103
2269
|
return {
|
|
2104
2270
|
...baseRubric,
|
|
2105
2271
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
2106
2272
|
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
2107
|
-
...
|
|
2273
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
2274
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
|
|
2108
2275
|
score_ranges: scoreRanges
|
|
2109
2276
|
};
|
|
2110
2277
|
}
|
|
@@ -2112,7 +2279,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
2112
2279
|
...baseRubric,
|
|
2113
2280
|
outcome: expectedOutcome,
|
|
2114
2281
|
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
2115
|
-
...
|
|
2282
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
2283
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
|
|
2116
2284
|
};
|
|
2117
2285
|
}).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
|
|
2118
2286
|
if (rubricItems.length === 0) {
|
|
@@ -2512,6 +2680,9 @@ function resolveExpectedMessages(raw) {
|
|
|
2512
2680
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
2513
2681
|
var ANSI_RED2 = "\x1B[31m";
|
|
2514
2682
|
var ANSI_RESET6 = "\x1B[0m";
|
|
2683
|
+
function matchesFilter(id, filter) {
|
|
2684
|
+
return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
|
|
2685
|
+
}
|
|
2515
2686
|
function detectFormat(filePath) {
|
|
2516
2687
|
const ext = path6.extname(filePath).toLowerCase();
|
|
2517
2688
|
if (ext === ".jsonl") return "jsonl";
|
|
@@ -2579,40 +2750,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2579
2750
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
2580
2751
|
const rawFile = await readFile5(absoluteTestPath, "utf8");
|
|
2581
2752
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
2582
|
-
const
|
|
2583
|
-
const
|
|
2753
|
+
const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
2754
|
+
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
2584
2755
|
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
|
|
2585
2756
|
const globalExecution = sidecar.execution;
|
|
2586
2757
|
if (verbose) {
|
|
2587
2758
|
console.log(`
|
|
2588
|
-
[JSONL
|
|
2759
|
+
[JSONL Suite: ${evalFilePath}]`);
|
|
2589
2760
|
console.log(` Cases: ${rawCases.length}`);
|
|
2590
|
-
console.log(`
|
|
2761
|
+
console.log(` Suite: ${suiteName}`);
|
|
2591
2762
|
if (sidecar.description) {
|
|
2592
2763
|
console.log(` Description: ${sidecar.description}`);
|
|
2593
2764
|
}
|
|
2594
2765
|
}
|
|
2595
2766
|
const results = [];
|
|
2596
2767
|
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
2597
|
-
const
|
|
2768
|
+
const testCaseConfig = rawCases[lineIndex];
|
|
2598
2769
|
const lineNumber = lineIndex + 1;
|
|
2599
|
-
const id = asString4(
|
|
2600
|
-
if (filterPattern && (!id || !
|
|
2770
|
+
const id = asString4(testCaseConfig.id);
|
|
2771
|
+
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
|
|
2601
2772
|
continue;
|
|
2602
2773
|
}
|
|
2603
|
-
const conversationId = asString4(
|
|
2604
|
-
let outcome = asString4(
|
|
2605
|
-
if (!outcome &&
|
|
2606
|
-
outcome = asString4(
|
|
2774
|
+
const conversationId = asString4(testCaseConfig.conversation_id);
|
|
2775
|
+
let outcome = asString4(testCaseConfig.criteria);
|
|
2776
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
2777
|
+
outcome = asString4(testCaseConfig.expected_outcome);
|
|
2607
2778
|
if (outcome) {
|
|
2608
2779
|
logWarning4(
|
|
2609
|
-
`Test '${asString4(
|
|
2780
|
+
`Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
2610
2781
|
);
|
|
2611
2782
|
}
|
|
2612
2783
|
}
|
|
2613
|
-
const rawInputMessages = resolveInputMessages(
|
|
2614
|
-
const expectedMessages = resolveExpectedMessages(
|
|
2615
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
2784
|
+
const rawInputMessages = resolveInputMessages(testCaseConfig);
|
|
2785
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
2786
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
|
|
2616
2787
|
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
2617
2788
|
logError2(
|
|
2618
2789
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
@@ -2649,18 +2820,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2649
2820
|
}
|
|
2650
2821
|
}
|
|
2651
2822
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
2652
|
-
const caseExecution = isJsonObject(
|
|
2823
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
2653
2824
|
const mergedExecution = caseExecution ?? globalExecution;
|
|
2654
|
-
const
|
|
2825
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
2655
2826
|
let evaluators;
|
|
2656
2827
|
try {
|
|
2657
|
-
evaluators = await parseEvaluators(
|
|
2828
|
+
evaluators = await parseEvaluators(
|
|
2829
|
+
testCaseConfig,
|
|
2830
|
+
mergedExecution,
|
|
2831
|
+
searchRoots,
|
|
2832
|
+
id ?? "unknown"
|
|
2833
|
+
);
|
|
2658
2834
|
} catch (error) {
|
|
2659
2835
|
const message = error instanceof Error ? error.message : String(error);
|
|
2660
2836
|
logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
|
|
2661
2837
|
continue;
|
|
2662
2838
|
}
|
|
2663
|
-
const inlineRubrics =
|
|
2839
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
2664
2840
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
2665
2841
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
2666
2842
|
if (rubricEvaluator) {
|
|
@@ -2671,7 +2847,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2671
2847
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
2672
2848
|
const testCase = {
|
|
2673
2849
|
id,
|
|
2674
|
-
|
|
2850
|
+
suite: suiteName,
|
|
2675
2851
|
conversation_id: conversationId,
|
|
2676
2852
|
question,
|
|
2677
2853
|
input: inputMessages,
|
|
@@ -2679,7 +2855,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2679
2855
|
reference_answer: referenceAnswer,
|
|
2680
2856
|
file_paths: userFilePaths,
|
|
2681
2857
|
criteria: outcome ?? "",
|
|
2682
|
-
evaluator:
|
|
2858
|
+
evaluator: testCaseEvaluatorKind,
|
|
2683
2859
|
assertions: evaluators
|
|
2684
2860
|
};
|
|
2685
2861
|
results.push(testCase);
|
|
@@ -2862,6 +3038,9 @@ function buildChatPromptFromSegments(options) {
|
|
|
2862
3038
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
2863
3039
|
var ANSI_RED3 = "\x1B[31m";
|
|
2864
3040
|
var ANSI_RESET7 = "\x1B[0m";
|
|
3041
|
+
/**
 * Checks whether a test id matches the given filter.
 *
 * @param {string} id - Test id to check.
 * @param {string | string[]} filter - A single glob pattern or a list of patterns.
 * @returns {boolean} True when the id matches the pattern, or any pattern in the list.
 */
function matchesFilter2(id, filter) {
  // Treat a lone pattern as a one-element list so both shapes share one code path.
  const patterns = typeof filter === "string" ? [filter] : filter;
  return patterns.some((pattern) => micromatch2.isMatch(id, pattern));
}
|
|
2865
3044
|
function resolveTests(suite) {
|
|
2866
3045
|
if (suite.tests !== void 0) return suite.tests;
|
|
2867
3046
|
if (suite.eval_cases !== void 0) {
|
|
@@ -2941,18 +3120,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2941
3120
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
2942
3121
|
}
|
|
2943
3122
|
const suite = interpolated;
|
|
2944
|
-
const
|
|
2945
|
-
const
|
|
2946
|
-
const
|
|
2947
|
-
const
|
|
3123
|
+
const suiteNameFromFile = asString5(suite.name)?.trim();
|
|
3124
|
+
const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
3125
|
+
const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
|
|
3126
|
+
const rawTestCases = resolveTests(suite);
|
|
2948
3127
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
2949
3128
|
const evalFileDir = path7.dirname(absoluteTestPath);
|
|
2950
|
-
let
|
|
2951
|
-
if (typeof
|
|
2952
|
-
const externalPath = path7.resolve(evalFileDir,
|
|
2953
|
-
|
|
2954
|
-
} else if (Array.isArray(
|
|
2955
|
-
|
|
3129
|
+
let expandedTestCases;
|
|
3130
|
+
if (typeof rawTestCases === "string") {
|
|
3131
|
+
const externalPath = path7.resolve(evalFileDir, rawTestCases);
|
|
3132
|
+
expandedTestCases = await loadCasesFromFile(externalPath);
|
|
3133
|
+
} else if (Array.isArray(rawTestCases)) {
|
|
3134
|
+
expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
|
|
2956
3135
|
} else {
|
|
2957
3136
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
2958
3137
|
}
|
|
@@ -2967,32 +3146,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2967
3146
|
}
|
|
2968
3147
|
const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
|
|
2969
3148
|
const results = [];
|
|
2970
|
-
for (const
|
|
2971
|
-
if (!isJsonObject(
|
|
3149
|
+
for (const rawTestCase of expandedTestCases) {
|
|
3150
|
+
if (!isJsonObject(rawTestCase)) {
|
|
2972
3151
|
logWarning5("Skipping invalid test entry (expected object)");
|
|
2973
3152
|
continue;
|
|
2974
3153
|
}
|
|
2975
|
-
const
|
|
2976
|
-
const id = asString5(
|
|
2977
|
-
if (filterPattern && (!id || !
|
|
3154
|
+
const testCaseConfig = rawTestCase;
|
|
3155
|
+
const id = asString5(testCaseConfig.id);
|
|
3156
|
+
if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
|
|
2978
3157
|
continue;
|
|
2979
3158
|
}
|
|
2980
|
-
const conversationId = asString5(
|
|
2981
|
-
let outcome = asString5(
|
|
2982
|
-
if (!outcome &&
|
|
2983
|
-
outcome = asString5(
|
|
3159
|
+
const conversationId = asString5(testCaseConfig.conversation_id);
|
|
3160
|
+
let outcome = asString5(testCaseConfig.criteria);
|
|
3161
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
3162
|
+
outcome = asString5(testCaseConfig.expected_outcome);
|
|
2984
3163
|
if (outcome) {
|
|
2985
3164
|
logWarning5(
|
|
2986
|
-
`Test '${asString5(
|
|
3165
|
+
`Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
2987
3166
|
);
|
|
2988
3167
|
}
|
|
2989
3168
|
}
|
|
2990
|
-
const caseExecution = isJsonObject(
|
|
3169
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
2991
3170
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
3171
|
+
const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
|
|
2992
3172
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
2993
|
-
const testInputMessages = resolveInputMessages(
|
|
2994
|
-
const expectedMessages = resolveExpectedMessages(
|
|
2995
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
3173
|
+
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
3174
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
3175
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
2996
3176
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
2997
3177
|
logError3(
|
|
2998
3178
|
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
@@ -3039,16 +3219,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3039
3219
|
}
|
|
3040
3220
|
}
|
|
3041
3221
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
3042
|
-
const
|
|
3222
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
3043
3223
|
let evaluators;
|
|
3044
3224
|
try {
|
|
3045
|
-
evaluators = await parseEvaluators(
|
|
3225
|
+
evaluators = await parseEvaluators(
|
|
3226
|
+
testCaseConfig,
|
|
3227
|
+
globalExecution,
|
|
3228
|
+
searchRoots,
|
|
3229
|
+
id ?? "unknown"
|
|
3230
|
+
);
|
|
3046
3231
|
} catch (error) {
|
|
3047
3232
|
const message = error instanceof Error ? error.message : String(error);
|
|
3048
3233
|
logError3(`Skipping test '${id}': ${message}`);
|
|
3049
3234
|
continue;
|
|
3050
3235
|
}
|
|
3051
|
-
const inlineRubrics =
|
|
3236
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
3052
3237
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
3053
3238
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
3054
3239
|
if (rubricEvaluator) {
|
|
@@ -3057,13 +3242,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3057
3242
|
}
|
|
3058
3243
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
3059
3244
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
3060
|
-
const caseWorkspace = await resolveWorkspaceConfig(
|
|
3245
|
+
const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
|
|
3061
3246
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
3062
|
-
const metadata = isJsonObject(
|
|
3063
|
-
const caseTargets = extractTargetsFromTestCase(
|
|
3247
|
+
const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
|
|
3248
|
+
const caseTargets = extractTargetsFromTestCase(testCaseConfig);
|
|
3064
3249
|
const testCase = {
|
|
3065
3250
|
id,
|
|
3066
|
-
|
|
3251
|
+
suite: suiteName,
|
|
3067
3252
|
category: options?.category,
|
|
3068
3253
|
conversation_id: conversationId,
|
|
3069
3254
|
question,
|
|
@@ -3072,11 +3257,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3072
3257
|
reference_answer: referenceAnswer,
|
|
3073
3258
|
file_paths: userFilePaths,
|
|
3074
3259
|
criteria: outcome ?? "",
|
|
3075
|
-
evaluator:
|
|
3260
|
+
evaluator: testCaseEvaluatorKind,
|
|
3076
3261
|
assertions: evaluators,
|
|
3077
3262
|
workspace: mergedWorkspace,
|
|
3078
3263
|
metadata,
|
|
3079
|
-
targets: caseTargets
|
|
3264
|
+
targets: caseTargets,
|
|
3265
|
+
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
|
|
3080
3266
|
};
|
|
3081
3267
|
results.push(testCase);
|
|
3082
3268
|
}
|
|
@@ -3620,7 +3806,7 @@ var AzureProvider = class {
|
|
|
3620
3806
|
};
|
|
3621
3807
|
this.retryConfig = config.retry;
|
|
3622
3808
|
const azure = createAzure(buildAzureOptions(config));
|
|
3623
|
-
this.model = azure.chat(config.deploymentName);
|
|
3809
|
+
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
3624
3810
|
}
|
|
3625
3811
|
id;
|
|
3626
3812
|
kind = "azure";
|
|
@@ -3746,7 +3932,9 @@ function buildAzureOptions(config) {
|
|
|
3746
3932
|
const options = {
|
|
3747
3933
|
apiKey: config.apiKey,
|
|
3748
3934
|
apiVersion: config.version,
|
|
3749
|
-
|
|
3935
|
+
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
3936
|
+
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
3937
|
+
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
3750
3938
|
};
|
|
3751
3939
|
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
3752
3940
|
if (baseURL) {
|
|
@@ -6946,6 +7134,25 @@ var CopilotSdkProvider = class {
|
|
|
6946
7134
|
content: systemPrompt
|
|
6947
7135
|
};
|
|
6948
7136
|
}
|
|
7137
|
+
if (this.config.byokBaseUrl) {
|
|
7138
|
+
const byokType = this.config.byokType ?? "openai";
|
|
7139
|
+
const provider = {
|
|
7140
|
+
type: byokType,
|
|
7141
|
+
baseUrl: normalizeByokBaseUrl(this.config.byokBaseUrl, byokType)
|
|
7142
|
+
};
|
|
7143
|
+
if (this.config.byokBearerToken) {
|
|
7144
|
+
provider.bearerToken = this.config.byokBearerToken;
|
|
7145
|
+
} else if (this.config.byokApiKey) {
|
|
7146
|
+
provider.apiKey = this.config.byokApiKey;
|
|
7147
|
+
}
|
|
7148
|
+
if (this.config.byokWireApi) {
|
|
7149
|
+
provider.wireApi = this.config.byokWireApi;
|
|
7150
|
+
}
|
|
7151
|
+
if (this.config.byokType === "azure" && this.config.byokApiVersion) {
|
|
7152
|
+
provider.azure = { apiVersion: this.config.byokApiVersion };
|
|
7153
|
+
}
|
|
7154
|
+
sessionOptions.provider = provider;
|
|
7155
|
+
}
|
|
6949
7156
|
let session;
|
|
6950
7157
|
try {
|
|
6951
7158
|
session = await client.createSession(sessionOptions);
|
|
@@ -7177,6 +7384,16 @@ function resolveSkillDirectories(cwd) {
|
|
|
7177
7384
|
];
|
|
7178
7385
|
return candidates.filter((dir) => existsSync2(dir));
|
|
7179
7386
|
}
|
|
7387
|
+
/**
 * Normalizes a bring-your-own-key base URL.
 *
 * Surrounding whitespace and trailing slashes are always removed. Values that
 * already carry an http(s) scheme, and all non-Azure values, are otherwise
 * returned as-is. For Azure, a bare resource name is expanded to
 * `https://<name>.openai.azure.com`.
 *
 * @param {string} baseUrl - Raw base URL or (for Azure) a resource name.
 * @param {string} type - Provider type; only "azure" triggers expansion.
 * @returns {string} The normalized base URL ("" when the input is blank).
 */
function normalizeByokBaseUrl(baseUrl, type) {
  const trimmed = baseUrl.trim().replace(/\/+$/, "");
  // Blank input must stay blank instead of becoming "https://.openai.azure.com".
  if (!trimmed || /^https?:\/\//i.test(trimmed) || type !== "azure") {
    return trimmed;
  }
  // A dot means the caller passed a hostname rather than a bare resource name;
  // appending the Azure suffix again would produce an unusable host.
  return trimmed.includes(".") ? `https://${trimmed}` : `https://${trimmed}.openai.azure.com`;
}
|
|
7180
7397
|
function summarizeSdkEvent(eventType, data) {
|
|
7181
7398
|
if (!data || typeof data !== "object") {
|
|
7182
7399
|
return eventType;
|
|
@@ -7354,6 +7571,22 @@ function extractAzureResourceName(baseUrl) {
|
|
|
7354
7571
|
if (urlMatch) return urlMatch[1];
|
|
7355
7572
|
return baseUrl;
|
|
7356
7573
|
}
|
|
7574
|
+
/**
 * Normalizes an Azure base URL (or bare resource name) to the `/openai/v1`
 * endpoint form.
 *
 * @param {string} baseUrl - Full URL, scheme-less hostname, or bare resource name.
 * @returns {string} URL ending in `/openai/v1`, or "" when the input is blank.
 */
function normalizeAzureSdkBaseUrl(baseUrl) {
  const trimmed = baseUrl.trim().replace(/\/+$/, "");
  if (!trimmed) {
    return trimmed;
  }
  let origin;
  if (/^https?:\/\//i.test(trimmed)) {
    origin = trimmed;
  } else if (trimmed.includes(".")) {
    // Scheme-less hostname: only add the scheme, never a second Azure suffix.
    origin = `https://${trimmed}`;
  } else {
    // Bare resource name.
    origin = `https://${trimmed}.openai.azure.com`;
  }
  if (/\/openai\/v1$/i.test(origin)) {
    return origin;
  }
  if (/\/openai$/i.test(origin)) {
    return `${origin}/v1`;
  }
  return `${origin}/openai/v1`;
}
|
|
7357
7590
|
|
|
7358
7591
|
// src/evaluation/providers/pi-utils.ts
|
|
7359
7592
|
function extractPiTextContent(content) {
|
|
@@ -8184,11 +8417,41 @@ async function defaultPiRunner(options) {
|
|
|
8184
8417
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
8185
8418
|
import { execSync as execSync2 } from "node:child_process";
|
|
8186
8419
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
8187
|
-
import { accessSync as accessSync2, createWriteStream as createWriteStream6 } from "node:fs";
|
|
8420
|
+
import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
|
|
8188
8421
|
import { mkdir as mkdir7 } from "node:fs/promises";
|
|
8189
|
-
import
|
|
8422
|
+
import path21 from "node:path";
|
|
8190
8423
|
import { createInterface } from "node:readline";
|
|
8191
|
-
import { fileURLToPath as fileURLToPath3 } from "node:url";
|
|
8424
|
+
import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
|
|
8425
|
+
|
|
8426
|
+
// src/paths.ts
|
|
8427
|
+
import os2 from "node:os";
|
|
8428
|
+
import path20 from "node:path";
|
|
8429
|
+
// Set once the AGENTV_HOME override has been announced, so the notice prints a single time.
var logged = false;
/**
 * Resolves the AgentV home directory.
 *
 * @returns {string} The AGENTV_HOME environment value when it is set (and is
 *   not the literal string "undefined"); otherwise `.agentv` under the user's
 *   home directory.
 */
function getAgentvHome() {
  const override = process.env.AGENTV_HOME;
  const hasOverride = Boolean(override) && override !== "undefined";
  if (!hasOverride) {
    return path20.join(os2.homedir(), ".agentv");
  }
  if (!logged) {
    logged = true;
    console.warn(`Using AGENTV_HOME: ${override}`);
  }
  return override;
}
|
|
8441
|
+
/**
 * @returns {string} Path of the `workspaces` directory under the AgentV home.
 */
function getWorkspacesRoot() {
  const home = getAgentvHome();
  return path20.join(home, "workspaces");
}
|
|
8444
|
+
/**
 * @returns {string} Path of the `subagents` directory under the AgentV home.
 */
function getSubagentsRoot() {
  const home = getAgentvHome();
  return path20.join(home, "subagents");
}
|
|
8447
|
+
/**
 * @returns {string} Path of the `trace-state` directory under the AgentV home.
 */
function getTraceStateRoot() {
  const home = getAgentvHome();
  return path20.join(home, "trace-state");
}
|
|
8450
|
+
/**
 * @returns {string} Path of the `workspace-pool` directory under the AgentV home.
 */
function getWorkspacePoolRoot() {
  const home = getAgentvHome();
  return path20.join(home, "workspace-pool");
}
|
|
8453
|
+
|
|
8454
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
8192
8455
|
var piCodingAgentModule = null;
|
|
8193
8456
|
var piAiModule = null;
|
|
8194
8457
|
var loadingPromise = null;
|
|
@@ -8206,46 +8469,126 @@ async function promptInstall() {
|
|
|
8206
8469
|
rl.close();
|
|
8207
8470
|
}
|
|
8208
8471
|
}
|
|
8209
|
-
function
|
|
8210
|
-
|
|
8211
|
-
|
|
8212
|
-
|
|
8472
|
+
/**
 * @returns {string} Path of the `deps/pi-sdk` directory under the AgentV home,
 *   where the SDK is installed and looked up by the managed loader.
 */
function findManagedSdkInstallRoot() {
  const agentvHome = getAgentvHome();
  return path21.join(agentvHome, "deps", "pi-sdk");
}
|
|
8475
|
+
/**
 * Asks npm for its global `node_modules` directory by running `npm root -g`.
 *
 * @returns {string | undefined} The trimmed command output, or undefined when
 *   the command fails or prints nothing.
 */
function resolveGlobalNpmRoot() {
  try {
    // stderr is discarded; only stdout is captured.
    const stdout = execSync2("npm root -g", {
      encoding: "utf-8",
      stdio: ["ignore", "pipe", "ignore"]
    });
    const root = stdout.trim();
    if (root.length > 0) {
      return root;
    }
    return void 0;
  } catch {
    return void 0;
  }
}
|
|
8486
|
+
/**
 * Builds the path of a package's `dist/index.js` inside a global npm root.
 *
 * @param {string} moduleName - Package name; "/" separators become path segments.
 * @param {string} globalNpmRoot - Global `node_modules` directory.
 * @returns {string} Joined path to the package's `dist/index.js`.
 */
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
  const segments = [globalNpmRoot, ...moduleName.split("/"), "dist", "index.js"];
  return path21.join(...segments);
}
|
|
8489
|
+
/**
 * Returns the first candidate path that `accessSync` accepts.
 *
 * @param {string[]} paths - Candidate paths, checked in order.
 * @returns {string | undefined} The first accessible path, or undefined if none is.
 */
function findAccessiblePath(paths) {
  const isAccessible = (candidate) => {
    try {
      accessSync2(candidate);
      return true;
    } catch {
      return false;
    }
  };
  return [...paths].find(isAccessible);
}
|
|
8225
|
-
async function
|
|
8499
|
+
/**
 * Tries to load both SDK packages through ordinary module resolution.
 *
 * On success the module-level `piCodingAgentModule` and `piAiModule` are set
 * together; on any failure neither is touched.
 *
 * @returns {Promise<boolean>} True when both imports succeeded.
 */
async function tryImportLocalSdkModules() {
  try {
    const loaded = await Promise.all([
      import("@mariozechner/pi-coding-agent"),
      import("@mariozechner/pi-ai")
    ]);
    [piCodingAgentModule, piAiModule] = loaded;
  } catch {
    return false;
  }
  return true;
}
|
|
8510
|
+
/**
 * Tries to load both SDK packages from the managed install directory.
 *
 * `pi-ai` is looked up first next to `pi-coding-agent`, then nested inside
 * `pi-coding-agent`'s own `node_modules`.
 *
 * @returns {Promise<boolean>} True when both entry files exist and import cleanly.
 */
async function tryImportManagedSdkModules() {
  const scopeDir = path21.join(findManagedSdkInstallRoot(), "node_modules", "@mariozechner");
  const entryOf = (packageDir) => path21.join(packageDir, "dist", "index.js");
  const codingAgentDir = path21.join(scopeDir, "pi-coding-agent");
  const codingAgentEntry = findAccessiblePath([entryOf(codingAgentDir)]);
  const aiEntry = findAccessiblePath([
    entryOf(path21.join(scopeDir, "pi-ai")),
    entryOf(path21.join(codingAgentDir, "node_modules", "@mariozechner", "pi-ai"))
  ]);
  if (!(codingAgentEntry && aiEntry)) {
    return false;
  }
  try {
    // Absolute file paths must be converted to file:// URLs before import().
    const loaded = await Promise.all(
      [codingAgentEntry, aiEntry].map((entry) => import(pathToFileURL(entry).href))
    );
    [piCodingAgentModule, piAiModule] = loaded;
    return true;
  } catch {
    return false;
  }
}
|
|
8540
|
+
/**
 * Tries to load both SDK packages from npm's global `node_modules`.
 *
 * `pi-ai` is looked up first as a top-level global package, then nested inside
 * the global `pi-coding-agent`'s own `node_modules`.
 *
 * @returns {Promise<boolean>} True when the global root is known, both entry
 *   files exist, and both import cleanly.
 */
async function tryImportGlobalSdkModules() {
  const globalNpmRoot = resolveGlobalNpmRoot();
  if (!globalNpmRoot) {
    return false;
  }
  const codingAgentEntry = findAccessiblePath([
    buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
  ]);
  const nestedAiEntry = path21.join(
    globalNpmRoot,
    "@mariozechner",
    "pi-coding-agent",
    "node_modules",
    "@mariozechner",
    "pi-ai",
    "dist",
    "index.js"
  );
  const aiEntry = findAccessiblePath([
    buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
    nestedAiEntry
  ]);
  if (!(codingAgentEntry && aiEntry)) {
    return false;
  }
  try {
    // Absolute file paths must be converted to file:// URLs before import().
    const loaded = await Promise.all(
      [codingAgentEntry, aiEntry].map((entry) => import(pathToFileURL(entry).href))
    );
    [piCodingAgentModule, piAiModule] = loaded;
    return true;
  } catch {
    return false;
  }
}
|
|
8570
|
+
/**
 * Installs `@mariozechner/pi-coding-agent` into the given directory with npm.
 *
 * The directory is created if missing. npm output is streamed to the parent
 * process; a failing npm command makes `execSync` throw.
 *
 * @param {string} installDir - Directory that should receive `node_modules`.
 */
function installSdkModules(installDir) {
  console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
  mkdirSync(installDir, { recursive: true });
  // installDir has no package.json, so without an explicit prefix npm walks up
  // the tree and installs into the nearest ancestor that has package.json or
  // node_modules (e.g. the user's home). "--prefix ." is resolved against cwd,
  // pinning the install to installDir without shell-quoting the path.
  execSync2("npm install --prefix . --no-save --no-package-lock @mariozechner/pi-coding-agent", {
    cwd: installDir,
    stdio: "inherit"
  });
}
|
|
8578
|
+
/**
 * Loads the SDK modules, trying the local, managed, and global locations in
 * that order. If none works and `promptInstall()` resolves truthy, installs
 * into the managed directory and retries the managed import.
 *
 * @returns {Promise<void>} Resolves once the modules are loaded.
 * @throws {Error} When the SDK could not be loaded from any location.
 */
async function doLoadSdkModules() {
  const loaders = [tryImportLocalSdkModules, tryImportManagedSdkModules, tryImportGlobalSdkModules];
  for (const load of loaders) {
    // Sequential on purpose: stop at the first location that works.
    if (await load()) {
      return;
    }
  }
  const installAccepted = await promptInstall();
  if (installAccepted) {
    installSdkModules(findManagedSdkInstallRoot());
    const loadedAfterInstall = await tryImportManagedSdkModules();
    if (loadedAfterInstall) {
      return;
    }
  }
  throw new Error(
    "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
  );
}
|
|
8250
8593
|
async function loadSdkModules() {
|
|
8251
8594
|
if (!piCodingAgentModule || !piAiModule) {
|
|
@@ -8302,12 +8645,16 @@ var PiCodingAgentProvider = class {
|
|
|
8302
8645
|
try {
|
|
8303
8646
|
const cwd = this.resolveCwd(request.cwd);
|
|
8304
8647
|
const rawProvider = this.config.subprovider ?? "google";
|
|
8305
|
-
const
|
|
8648
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
|
|
8649
|
+
const hasBaseUrl = !!normalizedBaseUrl;
|
|
8306
8650
|
const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
|
|
8307
8651
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
8308
8652
|
this.setApiKeyEnv(rawProvider, hasBaseUrl);
|
|
8309
|
-
this.setBaseUrlEnv(rawProvider, hasBaseUrl);
|
|
8653
|
+
this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
|
|
8310
8654
|
let model = sdk.getModel(providerName, modelId);
|
|
8655
|
+
if (model && normalizedBaseUrl) {
|
|
8656
|
+
model = { ...model, baseUrl: normalizedBaseUrl };
|
|
8657
|
+
}
|
|
8311
8658
|
if (!model) {
|
|
8312
8659
|
const envProvider = providerName.replace(/-responses$/, "");
|
|
8313
8660
|
model = {
|
|
@@ -8315,7 +8662,7 @@ var PiCodingAgentProvider = class {
|
|
|
8315
8662
|
name: modelId,
|
|
8316
8663
|
api: providerName,
|
|
8317
8664
|
provider: envProvider,
|
|
8318
|
-
baseUrl:
|
|
8665
|
+
baseUrl: normalizedBaseUrl ?? "",
|
|
8319
8666
|
reasoning: false,
|
|
8320
8667
|
input: ["text"],
|
|
8321
8668
|
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
|
@@ -8482,19 +8829,27 @@ ${fileList}`;
|
|
|
8482
8829
|
}
|
|
8483
8830
|
}
|
|
8484
8831
|
/** Maps config baseUrl to the provider-specific env var the SDK reads. */
|
|
8485
|
-
setBaseUrlEnv(providerName, hasBaseUrl = false) {
|
|
8486
|
-
|
|
8832
|
+
setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
|
|
8833
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
|
|
8834
|
+
if (!normalizedBaseUrl) return;
|
|
8487
8835
|
const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
|
|
8488
8836
|
if (envKey) {
|
|
8489
|
-
process.env[envKey] =
|
|
8837
|
+
process.env[envKey] = normalizedBaseUrl;
|
|
8490
8838
|
}
|
|
8491
8839
|
}
|
|
8840
|
+
normalizeSdkBaseUrl(providerName, baseUrl) {
|
|
8841
|
+
if (!baseUrl) return void 0;
|
|
8842
|
+
if (providerName.toLowerCase() === "azure") {
|
|
8843
|
+
return normalizeAzureSdkBaseUrl(baseUrl);
|
|
8844
|
+
}
|
|
8845
|
+
return baseUrl;
|
|
8846
|
+
}
|
|
8492
8847
|
resolveCwd(cwdOverride) {
|
|
8493
8848
|
if (cwdOverride) {
|
|
8494
|
-
return
|
|
8849
|
+
return path21.resolve(cwdOverride);
|
|
8495
8850
|
}
|
|
8496
8851
|
if (this.config.cwd) {
|
|
8497
|
-
return
|
|
8852
|
+
return path21.resolve(this.config.cwd);
|
|
8498
8853
|
}
|
|
8499
8854
|
return process.cwd();
|
|
8500
8855
|
}
|
|
@@ -8513,9 +8868,9 @@ ${fileList}`;
|
|
|
8513
8868
|
}
|
|
8514
8869
|
resolveLogDirectory() {
|
|
8515
8870
|
if (this.config.logDir) {
|
|
8516
|
-
return
|
|
8871
|
+
return path21.resolve(this.config.logDir);
|
|
8517
8872
|
}
|
|
8518
|
-
return
|
|
8873
|
+
return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
8519
8874
|
}
|
|
8520
8875
|
async createStreamLogger(request) {
|
|
8521
8876
|
const logDir = this.resolveLogDirectory();
|
|
@@ -8529,7 +8884,7 @@ ${fileList}`;
|
|
|
8529
8884
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
8530
8885
|
return void 0;
|
|
8531
8886
|
}
|
|
8532
|
-
const filePath =
|
|
8887
|
+
const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
|
|
8533
8888
|
try {
|
|
8534
8889
|
const logger = await PiStreamLogger2.create({
|
|
8535
8890
|
filePath,
|
|
@@ -8754,7 +9109,7 @@ import path30 from "node:path";
|
|
|
8754
9109
|
// src/evaluation/providers/vscode/utils/fs.ts
|
|
8755
9110
|
import { constants as constants2 } from "node:fs";
|
|
8756
9111
|
import { access as access2, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
|
|
8757
|
-
import
|
|
9112
|
+
import path22 from "node:path";
|
|
8758
9113
|
async function pathExists(target) {
|
|
8759
9114
|
try {
|
|
8760
9115
|
await access2(target, constants2.F_OK);
|
|
@@ -8770,7 +9125,7 @@ async function readDirEntries(target) {
|
|
|
8770
9125
|
const entries = await readdir2(target, { withFileTypes: true });
|
|
8771
9126
|
return entries.map((entry) => ({
|
|
8772
9127
|
name: entry.name,
|
|
8773
|
-
absolutePath:
|
|
9128
|
+
absolutePath: path22.join(target, entry.name),
|
|
8774
9129
|
isDirectory: entry.isDirectory()
|
|
8775
9130
|
}));
|
|
8776
9131
|
}
|
|
@@ -8785,9 +9140,9 @@ async function removeIfExists(target) {
|
|
|
8785
9140
|
}
|
|
8786
9141
|
|
|
8787
9142
|
// src/evaluation/providers/vscode/utils/path.ts
|
|
8788
|
-
import
|
|
9143
|
+
import path23 from "node:path";
|
|
8789
9144
|
function pathToFileUri2(filePath) {
|
|
8790
|
-
const absolutePath =
|
|
9145
|
+
const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
|
|
8791
9146
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
8792
9147
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
8793
9148
|
return `file:///${normalizedPath}`;
|
|
@@ -8796,7 +9151,7 @@ function pathToFileUri2(filePath) {
|
|
|
8796
9151
|
}
|
|
8797
9152
|
|
|
8798
9153
|
// src/evaluation/providers/vscode/dispatch/promptBuilder.ts
|
|
8799
|
-
import
|
|
9154
|
+
import path24 from "node:path";
|
|
8800
9155
|
|
|
8801
9156
|
// src/evaluation/providers/vscode/utils/template.ts
|
|
8802
9157
|
function renderTemplate2(content, variables) {
|
|
@@ -8888,8 +9243,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
|
|
|
8888
9243
|
});
|
|
8889
9244
|
}
|
|
8890
9245
|
function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
|
|
8891
|
-
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${
|
|
8892
|
-
const responseList = responseFiles.map((file) => `"${
|
|
9246
|
+
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
|
|
9247
|
+
const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
|
|
8893
9248
|
return renderTemplate2(templateContent, {
|
|
8894
9249
|
requestFiles: requestLines,
|
|
8895
9250
|
responseList
|
|
@@ -8898,7 +9253,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
8898
9253
|
|
|
8899
9254
|
// src/evaluation/providers/vscode/dispatch/responseWaiter.ts
|
|
8900
9255
|
import { readFile as readFile9 } from "node:fs/promises";
|
|
8901
|
-
import
|
|
9256
|
+
import path25 from "node:path";
|
|
8902
9257
|
|
|
8903
9258
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
8904
9259
|
function sleep2(ms) {
|
|
@@ -8957,7 +9312,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
8957
9312
|
}
|
|
8958
9313
|
async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
|
|
8959
9314
|
if (!silent) {
|
|
8960
|
-
const fileList = responseFilesFinal.map((file) =>
|
|
9315
|
+
const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
|
|
8961
9316
|
console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
|
|
8962
9317
|
}
|
|
8963
9318
|
const deadline = Date.now() + timeoutMs;
|
|
@@ -8966,7 +9321,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
8966
9321
|
while (pending.size > 0) {
|
|
8967
9322
|
if (Date.now() >= deadline) {
|
|
8968
9323
|
if (!silent) {
|
|
8969
|
-
const remaining = [...pending].map((f) =>
|
|
9324
|
+
const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
|
|
8970
9325
|
console.error(
|
|
8971
9326
|
`error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
|
|
8972
9327
|
);
|
|
@@ -9022,36 +9377,6 @@ import { promisify as promisify2 } from "node:util";
|
|
|
9022
9377
|
|
|
9023
9378
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
9024
9379
|
import path26 from "node:path";
|
|
9025
|
-
|
|
9026
|
-
// src/paths.ts
|
|
9027
|
-
import os2 from "node:os";
|
|
9028
|
-
import path25 from "node:path";
|
|
9029
|
-
var logged = false;
|
|
9030
|
-
function getAgentvHome() {
|
|
9031
|
-
const envHome = process.env.AGENTV_HOME;
|
|
9032
|
-
if (envHome && envHome !== "undefined") {
|
|
9033
|
-
if (!logged) {
|
|
9034
|
-
logged = true;
|
|
9035
|
-
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
9036
|
-
}
|
|
9037
|
-
return envHome;
|
|
9038
|
-
}
|
|
9039
|
-
return path25.join(os2.homedir(), ".agentv");
|
|
9040
|
-
}
|
|
9041
|
-
function getWorkspacesRoot() {
|
|
9042
|
-
return path25.join(getAgentvHome(), "workspaces");
|
|
9043
|
-
}
|
|
9044
|
-
function getSubagentsRoot() {
|
|
9045
|
-
return path25.join(getAgentvHome(), "subagents");
|
|
9046
|
-
}
|
|
9047
|
-
function getTraceStateRoot() {
|
|
9048
|
-
return path25.join(getAgentvHome(), "trace-state");
|
|
9049
|
-
}
|
|
9050
|
-
function getWorkspacePoolRoot() {
|
|
9051
|
-
return path25.join(getAgentvHome(), "workspace-pool");
|
|
9052
|
-
}
|
|
9053
|
-
|
|
9054
|
-
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
9055
9380
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
9056
9381
|
var DEFAULT_ALIVE_FILENAME = ".alive";
|
|
9057
9382
|
function getDefaultSubagentRoot(vscodeCmd = "code") {
|
|
@@ -10308,9 +10633,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
10308
10633
|
}
|
|
10309
10634
|
|
|
10310
10635
|
// src/evaluation/evaluators/scoring.ts
|
|
10311
|
-
var
|
|
10312
|
-
|
|
10313
|
-
|
|
10636
|
+
// Score at or above which a result counts as a pass when no threshold is given.
var DEFAULT_THRESHOLD = 0.8;
// Second name bound to the same default value.
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
/**
 * Converts a numeric score into a verdict.
 *
 * @param {number} score - Score to classify.
 * @param {number} [threshold=DEFAULT_THRESHOLD] - Minimum passing score (inclusive).
 * @returns {"pass" | "fail"} "pass" when score >= threshold, otherwise "fail".
 */
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
  if (score >= threshold) {
    return "pass";
  }
  return "fail";
}
|
|
10315
10641
|
function clampScore(value) {
|
|
10316
10642
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -10499,13 +10825,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
10499
10825
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
10500
10826
|
const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
10501
10827
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
10502
|
-
const
|
|
10828
|
+
const path50 = await import("node:path");
|
|
10503
10829
|
const { randomUUID: randomUUID10 } = await import("node:crypto");
|
|
10504
|
-
const dir =
|
|
10830
|
+
const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
10505
10831
|
await mkdir16(dir, { recursive: true });
|
|
10506
|
-
const stdinPath =
|
|
10507
|
-
const stdoutPath =
|
|
10508
|
-
const stderrPath =
|
|
10832
|
+
const stdinPath = path50.join(dir, "stdin.txt");
|
|
10833
|
+
const stdoutPath = path50.join(dir, "stdout.txt");
|
|
10834
|
+
const stderrPath = path50.join(dir, "stderr.txt");
|
|
10509
10835
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
10510
10836
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
10511
10837
|
const { spawn: spawn5 } = await import("node:child_process");
|
|
@@ -11703,7 +12029,7 @@ ${outputSchema}`;
|
|
|
11703
12029
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
11704
12030
|
for (const rubric of rubrics) {
|
|
11705
12031
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
11706
|
-
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
12032
|
+
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
11707
12033
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
11708
12034
|
if (rubric.outcome) {
|
|
11709
12035
|
parts.push(`Description: ${rubric.outcome}`);
|
|
@@ -11757,54 +12083,106 @@ ${outputSchema}`;
|
|
|
11757
12083
|
async runWithRetry(options) {
|
|
11758
12084
|
const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
11759
12085
|
let lastError;
|
|
12086
|
+
let lastInvalidResponse;
|
|
12087
|
+
let shouldAttemptStructureFix = false;
|
|
11760
12088
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
11761
12089
|
try {
|
|
11762
|
-
const
|
|
11763
|
-
|
|
11764
|
-
|
|
11765
|
-
|
|
11766
|
-
|
|
11767
|
-
|
|
11768
|
-
|
|
11769
|
-
|
|
11770
|
-
|
|
11771
|
-
|
|
11772
|
-
|
|
11773
|
-
|
|
11774
|
-
|
|
11775
|
-
|
|
11776
|
-
|
|
11777
|
-
|
|
11778
|
-
]
|
|
11779
|
-
}
|
|
11780
|
-
],
|
|
11781
|
-
...modelOptions
|
|
11782
|
-
}) : await generateText2({
|
|
11783
|
-
model,
|
|
11784
|
-
system: systemPrompt,
|
|
11785
|
-
prompt: userPrompt,
|
|
11786
|
-
...modelOptions
|
|
11787
|
-
});
|
|
11788
|
-
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
11789
|
-
const rawUsage = result.usage;
|
|
11790
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
11791
|
-
return { data: data2, tokenUsage };
|
|
12090
|
+
const result = await this.generateStructuredResponse({
|
|
12091
|
+
context,
|
|
12092
|
+
graderProvider,
|
|
12093
|
+
systemPrompt,
|
|
12094
|
+
userPrompt,
|
|
12095
|
+
images
|
|
12096
|
+
});
|
|
12097
|
+
const canRepairResponse = result.text.trim().length > 0;
|
|
12098
|
+
lastInvalidResponse = canRepairResponse ? result : void 0;
|
|
12099
|
+
let data;
|
|
12100
|
+
try {
|
|
12101
|
+
data = schema.parse(parseJsonFromText(result.text));
|
|
12102
|
+
} catch (e) {
|
|
12103
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
12104
|
+
shouldAttemptStructureFix = canRepairResponse;
|
|
12105
|
+
continue;
|
|
11792
12106
|
}
|
|
11793
|
-
|
|
11794
|
-
|
|
12107
|
+
return {
|
|
12108
|
+
data,
|
|
12109
|
+
providerResponse: result.providerResponse,
|
|
12110
|
+
tokenUsage: result.tokenUsage
|
|
12111
|
+
};
|
|
12112
|
+
} catch (e) {
|
|
12113
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
12114
|
+
}
|
|
12115
|
+
}
|
|
12116
|
+
if (shouldAttemptStructureFix && lastInvalidResponse) {
|
|
12117
|
+
try {
|
|
12118
|
+
const repaired = await this.generateStructuredResponse({
|
|
12119
|
+
context,
|
|
12120
|
+
graderProvider,
|
|
11795
12121
|
systemPrompt,
|
|
11796
|
-
|
|
11797
|
-
|
|
11798
|
-
|
|
11799
|
-
|
|
12122
|
+
userPrompt: buildStructureRepairPrompt({
|
|
12123
|
+
validationError: lastError?.message ?? "Schema validation failed",
|
|
12124
|
+
invalidResponse: lastInvalidResponse.text
|
|
12125
|
+
})
|
|
11800
12126
|
});
|
|
11801
|
-
const data = schema.parse(parseJsonFromText(
|
|
11802
|
-
return {
|
|
12127
|
+
const data = schema.parse(parseJsonFromText(repaired.text));
|
|
12128
|
+
return {
|
|
12129
|
+
data,
|
|
12130
|
+
providerResponse: repaired.providerResponse,
|
|
12131
|
+
tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
|
|
12132
|
+
};
|
|
11803
12133
|
} catch (e) {
|
|
11804
12134
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
11805
12135
|
}
|
|
11806
12136
|
}
|
|
11807
|
-
throw new Error(
|
|
12137
|
+
throw new Error(
|
|
12138
|
+
`Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
|
|
12139
|
+
);
|
|
12140
|
+
}
|
|
12141
|
+
async generateStructuredResponse(options) {
|
|
12142
|
+
const { context, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
12143
|
+
const model = graderProvider.asLanguageModel?.();
|
|
12144
|
+
if (model) {
|
|
12145
|
+
const modelOptions = {
|
|
12146
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
12147
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
12148
|
+
};
|
|
12149
|
+
const hasImages = images && images.length > 0;
|
|
12150
|
+
const result = hasImages ? await generateText2({
|
|
12151
|
+
model,
|
|
12152
|
+
system: systemPrompt,
|
|
12153
|
+
messages: [
|
|
12154
|
+
{
|
|
12155
|
+
role: "user",
|
|
12156
|
+
content: [
|
|
12157
|
+
{ type: "text", text: userPrompt },
|
|
12158
|
+
...toAiSdkImageParts(images)
|
|
12159
|
+
]
|
|
12160
|
+
}
|
|
12161
|
+
],
|
|
12162
|
+
...modelOptions
|
|
12163
|
+
}) : await generateText2({
|
|
12164
|
+
model,
|
|
12165
|
+
system: systemPrompt,
|
|
12166
|
+
prompt: userPrompt,
|
|
12167
|
+
...modelOptions
|
|
12168
|
+
});
|
|
12169
|
+
const rawUsage = result.usage;
|
|
12170
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
12171
|
+
return { text: result.text, tokenUsage };
|
|
12172
|
+
}
|
|
12173
|
+
const response = await graderProvider.invoke({
|
|
12174
|
+
question: userPrompt,
|
|
12175
|
+
systemPrompt,
|
|
12176
|
+
evalCaseId: context.evalCase.id,
|
|
12177
|
+
attempt: context.attempt,
|
|
12178
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
12179
|
+
temperature: this.temperature
|
|
12180
|
+
});
|
|
12181
|
+
return {
|
|
12182
|
+
text: extractLastAssistantContent(response.output),
|
|
12183
|
+
providerResponse: response,
|
|
12184
|
+
tokenUsage: response.tokenUsage
|
|
12185
|
+
};
|
|
11808
12186
|
}
|
|
11809
12187
|
};
|
|
11810
12188
|
function buildOutputSchema() {
|
|
@@ -11824,6 +12202,29 @@ function buildOutputSchema() {
|
|
|
11824
12202
|
"}"
|
|
11825
12203
|
].join("\n");
|
|
11826
12204
|
}
|
|
12205
|
+
function buildStructureRepairPrompt(options) {
|
|
12206
|
+
const { validationError, invalidResponse } = options;
|
|
12207
|
+
return [
|
|
12208
|
+
"The following evaluation response has useful grading content but invalid JSON structure.",
|
|
12209
|
+
"Repair it to satisfy the schema in the system prompt.",
|
|
12210
|
+
"Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
|
|
12211
|
+
"",
|
|
12212
|
+
"Validation error:",
|
|
12213
|
+
validationError,
|
|
12214
|
+
"",
|
|
12215
|
+
"Invalid response:",
|
|
12216
|
+
invalidResponse
|
|
12217
|
+
].join("\n");
|
|
12218
|
+
}
|
|
12219
|
+
function sumTokenUsage(first, second) {
|
|
12220
|
+
if (!first && !second) {
|
|
12221
|
+
return void 0;
|
|
12222
|
+
}
|
|
12223
|
+
return {
|
|
12224
|
+
input: (first?.input ?? 0) + (second?.input ?? 0),
|
|
12225
|
+
output: (first?.output ?? 0) + (second?.output ?? 0)
|
|
12226
|
+
};
|
|
12227
|
+
}
|
|
11827
12228
|
function buildRubricOutputSchema() {
|
|
11828
12229
|
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
11829
12230
|
You must return a valid JSON object matching this schema:
|
|
@@ -11923,19 +12324,21 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
11923
12324
|
rawScores[rubric.id] = rawScore;
|
|
11924
12325
|
totalWeight += rubric.weight;
|
|
11925
12326
|
weightedScoreSum += normalizedScore * rubric.weight;
|
|
11926
|
-
let
|
|
11927
|
-
if (rubric.
|
|
11928
|
-
|
|
12327
|
+
let minScoreThreshold;
|
|
12328
|
+
if (rubric.min_score !== void 0) {
|
|
12329
|
+
minScoreThreshold = rubric.min_score;
|
|
12330
|
+
} else if (rubric.required_min_score !== void 0) {
|
|
12331
|
+
minScoreThreshold = rubric.required_min_score / 10;
|
|
11929
12332
|
} else if (rubric.required === true) {
|
|
11930
|
-
|
|
12333
|
+
minScoreThreshold = 1;
|
|
11931
12334
|
}
|
|
11932
12335
|
const matchingRange = rubric.score_ranges?.find(
|
|
11933
12336
|
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
11934
12337
|
);
|
|
11935
12338
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
11936
12339
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
11937
|
-
const passed = !(
|
|
11938
|
-
if (
|
|
12340
|
+
const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
|
|
12341
|
+
if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
|
|
11939
12342
|
failedRequired = true;
|
|
11940
12343
|
}
|
|
11941
12344
|
assertions.push({
|
|
@@ -12012,11 +12415,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
12012
12415
|
execute: async (input) => {
|
|
12013
12416
|
try {
|
|
12014
12417
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
12015
|
-
const
|
|
12016
|
-
if (
|
|
12418
|
+
const stat11 = await fs2.stat(resolved);
|
|
12419
|
+
if (stat11.isDirectory()) {
|
|
12017
12420
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
12018
12421
|
}
|
|
12019
|
-
const buffer = Buffer.alloc(Math.min(
|
|
12422
|
+
const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
|
|
12020
12423
|
const fd = await fs2.open(resolved, "r");
|
|
12021
12424
|
try {
|
|
12022
12425
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -12024,8 +12427,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
12024
12427
|
await fd.close();
|
|
12025
12428
|
}
|
|
12026
12429
|
const content = buffer.toString("utf-8");
|
|
12027
|
-
const truncated =
|
|
12028
|
-
return { content, truncated, size:
|
|
12430
|
+
const truncated = stat11.size > MAX_FILE_SIZE;
|
|
12431
|
+
return { content, truncated, size: stat11.size };
|
|
12029
12432
|
} catch (error) {
|
|
12030
12433
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
12031
12434
|
}
|
|
@@ -12076,8 +12479,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
12076
12479
|
const ext = path35.extname(entry.name).toLowerCase();
|
|
12077
12480
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
12078
12481
|
try {
|
|
12079
|
-
const
|
|
12080
|
-
if (
|
|
12482
|
+
const stat11 = await fs2.stat(fullPath);
|
|
12483
|
+
if (stat11.size > MAX_FILE_SIZE) continue;
|
|
12081
12484
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
12082
12485
|
const lines = content.split("\n");
|
|
12083
12486
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -12718,115 +13121,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
12718
13121
|
* Evaluate a single field against the expected value.
|
|
12719
13122
|
*/
|
|
12720
13123
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
12721
|
-
const { path:
|
|
12722
|
-
const candidateValue = resolvePath(candidateData,
|
|
12723
|
-
const expectedValue = resolvePath(expectedData,
|
|
13124
|
+
const { path: path50, match, required = true, weight = 1 } = fieldConfig;
|
|
13125
|
+
const candidateValue = resolvePath(candidateData, path50);
|
|
13126
|
+
const expectedValue = resolvePath(expectedData, path50);
|
|
12724
13127
|
if (expectedValue === void 0) {
|
|
12725
13128
|
return {
|
|
12726
|
-
path:
|
|
13129
|
+
path: path50,
|
|
12727
13130
|
score: 1,
|
|
12728
13131
|
// No expected value means no comparison needed
|
|
12729
13132
|
weight,
|
|
12730
13133
|
hit: true,
|
|
12731
|
-
message: `${
|
|
13134
|
+
message: `${path50}: no expected value`
|
|
12732
13135
|
};
|
|
12733
13136
|
}
|
|
12734
13137
|
if (candidateValue === void 0) {
|
|
12735
13138
|
if (required) {
|
|
12736
13139
|
return {
|
|
12737
|
-
path:
|
|
13140
|
+
path: path50,
|
|
12738
13141
|
score: 0,
|
|
12739
13142
|
weight,
|
|
12740
13143
|
hit: false,
|
|
12741
|
-
message: `${
|
|
13144
|
+
message: `${path50} (required, missing)`
|
|
12742
13145
|
};
|
|
12743
13146
|
}
|
|
12744
13147
|
return {
|
|
12745
|
-
path:
|
|
13148
|
+
path: path50,
|
|
12746
13149
|
score: 1,
|
|
12747
13150
|
// Don't penalize missing optional fields
|
|
12748
13151
|
weight: 0,
|
|
12749
13152
|
// Zero weight means it won't affect the score
|
|
12750
13153
|
hit: true,
|
|
12751
|
-
message: `${
|
|
13154
|
+
message: `${path50}: optional field missing`
|
|
12752
13155
|
};
|
|
12753
13156
|
}
|
|
12754
13157
|
switch (match) {
|
|
12755
13158
|
case "exact":
|
|
12756
|
-
return this.compareExact(
|
|
13159
|
+
return this.compareExact(path50, candidateValue, expectedValue, weight);
|
|
12757
13160
|
case "numeric_tolerance":
|
|
12758
13161
|
return this.compareNumericTolerance(
|
|
12759
|
-
|
|
13162
|
+
path50,
|
|
12760
13163
|
candidateValue,
|
|
12761
13164
|
expectedValue,
|
|
12762
13165
|
fieldConfig,
|
|
12763
13166
|
weight
|
|
12764
13167
|
);
|
|
12765
13168
|
case "date":
|
|
12766
|
-
return this.compareDate(
|
|
13169
|
+
return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
|
|
12767
13170
|
default:
|
|
12768
13171
|
return {
|
|
12769
|
-
path:
|
|
13172
|
+
path: path50,
|
|
12770
13173
|
score: 0,
|
|
12771
13174
|
weight,
|
|
12772
13175
|
hit: false,
|
|
12773
|
-
message: `${
|
|
13176
|
+
message: `${path50}: unknown match type "${match}"`
|
|
12774
13177
|
};
|
|
12775
13178
|
}
|
|
12776
13179
|
}
|
|
12777
13180
|
/**
|
|
12778
13181
|
* Exact equality comparison.
|
|
12779
13182
|
*/
|
|
12780
|
-
compareExact(
|
|
13183
|
+
compareExact(path50, candidateValue, expectedValue, weight) {
|
|
12781
13184
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
12782
13185
|
return {
|
|
12783
|
-
path:
|
|
13186
|
+
path: path50,
|
|
12784
13187
|
score: 1,
|
|
12785
13188
|
weight,
|
|
12786
13189
|
hit: true,
|
|
12787
|
-
message:
|
|
13190
|
+
message: path50
|
|
12788
13191
|
};
|
|
12789
13192
|
}
|
|
12790
13193
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
12791
13194
|
return {
|
|
12792
|
-
path:
|
|
13195
|
+
path: path50,
|
|
12793
13196
|
score: 0,
|
|
12794
13197
|
weight,
|
|
12795
13198
|
hit: false,
|
|
12796
|
-
message: `${
|
|
13199
|
+
message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
12797
13200
|
};
|
|
12798
13201
|
}
|
|
12799
13202
|
return {
|
|
12800
|
-
path:
|
|
13203
|
+
path: path50,
|
|
12801
13204
|
score: 0,
|
|
12802
13205
|
weight,
|
|
12803
13206
|
hit: false,
|
|
12804
|
-
message: `${
|
|
13207
|
+
message: `${path50} (value mismatch)`
|
|
12805
13208
|
};
|
|
12806
13209
|
}
|
|
12807
13210
|
/**
|
|
12808
13211
|
* Numeric comparison with absolute or relative tolerance.
|
|
12809
13212
|
*/
|
|
12810
|
-
compareNumericTolerance(
|
|
13213
|
+
compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12811
13214
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
12812
13215
|
const candidateNum = toNumber(candidateValue);
|
|
12813
13216
|
const expectedNum = toNumber(expectedValue);
|
|
12814
13217
|
if (candidateNum === null || expectedNum === null) {
|
|
12815
13218
|
return {
|
|
12816
|
-
path:
|
|
13219
|
+
path: path50,
|
|
12817
13220
|
score: 0,
|
|
12818
13221
|
weight,
|
|
12819
13222
|
hit: false,
|
|
12820
|
-
message: `${
|
|
13223
|
+
message: `${path50} (non-numeric value)`
|
|
12821
13224
|
};
|
|
12822
13225
|
}
|
|
12823
13226
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
12824
13227
|
return {
|
|
12825
|
-
path:
|
|
13228
|
+
path: path50,
|
|
12826
13229
|
score: 0,
|
|
12827
13230
|
weight,
|
|
12828
13231
|
hit: false,
|
|
12829
|
-
message: `${
|
|
13232
|
+
message: `${path50} (invalid numeric value)`
|
|
12830
13233
|
};
|
|
12831
13234
|
}
|
|
12832
13235
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -12839,61 +13242,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
12839
13242
|
}
|
|
12840
13243
|
if (withinTolerance) {
|
|
12841
13244
|
return {
|
|
12842
|
-
path:
|
|
13245
|
+
path: path50,
|
|
12843
13246
|
score: 1,
|
|
12844
13247
|
weight,
|
|
12845
13248
|
hit: true,
|
|
12846
|
-
message: `${
|
|
13249
|
+
message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
|
|
12847
13250
|
};
|
|
12848
13251
|
}
|
|
12849
13252
|
return {
|
|
12850
|
-
path:
|
|
13253
|
+
path: path50,
|
|
12851
13254
|
score: 0,
|
|
12852
13255
|
weight,
|
|
12853
13256
|
hit: false,
|
|
12854
|
-
message: `${
|
|
13257
|
+
message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
12855
13258
|
};
|
|
12856
13259
|
}
|
|
12857
13260
|
/**
|
|
12858
13261
|
* Date comparison with format normalization.
|
|
12859
13262
|
*/
|
|
12860
|
-
compareDate(
|
|
13263
|
+
compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12861
13264
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
12862
13265
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
12863
13266
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
12864
13267
|
if (candidateDate === null) {
|
|
12865
13268
|
return {
|
|
12866
|
-
path:
|
|
13269
|
+
path: path50,
|
|
12867
13270
|
score: 0,
|
|
12868
13271
|
weight,
|
|
12869
13272
|
hit: false,
|
|
12870
|
-
message: `${
|
|
13273
|
+
message: `${path50} (unparseable candidate date)`
|
|
12871
13274
|
};
|
|
12872
13275
|
}
|
|
12873
13276
|
if (expectedDate === null) {
|
|
12874
13277
|
return {
|
|
12875
|
-
path:
|
|
13278
|
+
path: path50,
|
|
12876
13279
|
score: 0,
|
|
12877
13280
|
weight,
|
|
12878
13281
|
hit: false,
|
|
12879
|
-
message: `${
|
|
13282
|
+
message: `${path50} (unparseable expected date)`
|
|
12880
13283
|
};
|
|
12881
13284
|
}
|
|
12882
13285
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
12883
13286
|
return {
|
|
12884
|
-
path:
|
|
13287
|
+
path: path50,
|
|
12885
13288
|
score: 1,
|
|
12886
13289
|
weight,
|
|
12887
13290
|
hit: true,
|
|
12888
|
-
message:
|
|
13291
|
+
message: path50
|
|
12889
13292
|
};
|
|
12890
13293
|
}
|
|
12891
13294
|
return {
|
|
12892
|
-
path:
|
|
13295
|
+
path: path50,
|
|
12893
13296
|
score: 0,
|
|
12894
13297
|
weight,
|
|
12895
13298
|
hit: false,
|
|
12896
|
-
message: `${
|
|
13299
|
+
message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
12897
13300
|
};
|
|
12898
13301
|
}
|
|
12899
13302
|
/**
|
|
@@ -12926,11 +13329,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
12926
13329
|
};
|
|
12927
13330
|
}
|
|
12928
13331
|
};
|
|
12929
|
-
function resolvePath(obj,
|
|
12930
|
-
if (!
|
|
13332
|
+
function resolvePath(obj, path50) {
|
|
13333
|
+
if (!path50 || !obj) {
|
|
12931
13334
|
return void 0;
|
|
12932
13335
|
}
|
|
12933
|
-
const parts =
|
|
13336
|
+
const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
12934
13337
|
let current = obj;
|
|
12935
13338
|
for (const part of parts) {
|
|
12936
13339
|
if (current === null || current === void 0) {
|
|
@@ -13422,8 +13825,8 @@ var TokenUsageEvaluator = class {
|
|
|
13422
13825
|
};
|
|
13423
13826
|
|
|
13424
13827
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
13425
|
-
function getNestedValue(obj,
|
|
13426
|
-
const parts =
|
|
13828
|
+
function getNestedValue(obj, path50) {
|
|
13829
|
+
const parts = path50.split(".");
|
|
13427
13830
|
let current = obj;
|
|
13428
13831
|
for (const part of parts) {
|
|
13429
13832
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -15195,7 +15598,7 @@ var WorkspacePoolManager = class {
|
|
|
15195
15598
|
}
|
|
15196
15599
|
/**
|
|
15197
15600
|
* Reset an existing slot for reuse:
|
|
15198
|
-
* 1. Reset repos (git reset --hard
|
|
15601
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
15199
15602
|
* 2. Re-copy template files (skip repo directories)
|
|
15200
15603
|
*/
|
|
15201
15604
|
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
@@ -15208,7 +15611,17 @@ var WorkspacePoolManager = class {
|
|
|
15208
15611
|
continue;
|
|
15209
15612
|
}
|
|
15210
15613
|
const ref = repo.checkout?.ref ?? "HEAD";
|
|
15211
|
-
|
|
15614
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
15615
|
+
if (resolve === "remote") {
|
|
15616
|
+
const fetchArgs = ["fetch", "origin", ref];
|
|
15617
|
+
if (repo.clone?.depth) {
|
|
15618
|
+
fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
|
|
15619
|
+
}
|
|
15620
|
+
await git(fetchArgs, { cwd: repoDir });
|
|
15621
|
+
await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
|
|
15622
|
+
} else {
|
|
15623
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
15624
|
+
}
|
|
15212
15625
|
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
15213
15626
|
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
15214
15627
|
}
|
|
@@ -15505,7 +15918,7 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
15505
15918
|
}
|
|
15506
15919
|
|
|
15507
15920
|
// src/evaluation/orchestrator.ts
|
|
15508
|
-
function classifyQualityStatus(score, threshold =
|
|
15921
|
+
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
|
|
15509
15922
|
return score >= threshold ? "ok" : "quality_failure";
|
|
15510
15923
|
}
|
|
15511
15924
|
function buildSkippedEvaluatorError(scores) {
|
|
@@ -15597,7 +16010,7 @@ async function runEvaluation(options) {
|
|
|
15597
16010
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
15598
16011
|
if (filteredEvalCases.length === 0) {
|
|
15599
16012
|
if (filter) {
|
|
15600
|
-
throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
|
|
16013
|
+
throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
|
|
15601
16014
|
}
|
|
15602
16015
|
return [];
|
|
15603
16016
|
}
|
|
@@ -15649,6 +16062,9 @@ async function runEvaluation(options) {
|
|
|
15649
16062
|
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
15650
16063
|
const resolvedGrader = resolveTargetByName(graderName);
|
|
15651
16064
|
if (!resolvedGrader) {
|
|
16065
|
+
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
16066
|
+
return void 0;
|
|
16067
|
+
}
|
|
15652
16068
|
return getOrCreateProvider(targetContext);
|
|
15653
16069
|
}
|
|
15654
16070
|
return getOrCreateProvider(resolvedGrader);
|
|
@@ -15979,7 +16395,7 @@ async function runEvaluation(options) {
|
|
|
15979
16395
|
const budgetResult = {
|
|
15980
16396
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
15981
16397
|
testId: evalCase.id,
|
|
15982
|
-
|
|
16398
|
+
suite: evalCase.suite,
|
|
15983
16399
|
category: evalCase.category,
|
|
15984
16400
|
score: 0,
|
|
15985
16401
|
assertions: [],
|
|
@@ -16016,7 +16432,7 @@ async function runEvaluation(options) {
|
|
|
16016
16432
|
const haltResult = {
|
|
16017
16433
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
16018
16434
|
testId: evalCase.id,
|
|
16019
|
-
|
|
16435
|
+
suite: evalCase.suite,
|
|
16020
16436
|
category: evalCase.category,
|
|
16021
16437
|
score: 0,
|
|
16022
16438
|
assertions: [],
|
|
@@ -16328,7 +16744,7 @@ async function runBatchEvaluation(options) {
|
|
|
16328
16744
|
targetResolver,
|
|
16329
16745
|
availableTargets,
|
|
16330
16746
|
verbose,
|
|
16331
|
-
threshold: batchThreshold
|
|
16747
|
+
threshold: evalCase.threshold ?? batchThreshold
|
|
16332
16748
|
});
|
|
16333
16749
|
if (providerError) {
|
|
16334
16750
|
result = {
|
|
@@ -16790,8 +17206,9 @@ async function runEvalCase(options) {
|
|
|
16790
17206
|
fileChanges,
|
|
16791
17207
|
workspacePath,
|
|
16792
17208
|
verbose,
|
|
16793
|
-
threshold: caseThreshold
|
|
17209
|
+
threshold: evalCase.threshold ?? caseThreshold
|
|
16794
17210
|
});
|
|
17211
|
+
const effectiveThreshold = evalCase.threshold ?? caseThreshold;
|
|
16795
17212
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
16796
17213
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
16797
17214
|
const evalRunTokenUsage = tokenUsage || graderTokens ? {
|
|
@@ -16805,7 +17222,7 @@ async function runEvalCase(options) {
|
|
|
16805
17222
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
16806
17223
|
};
|
|
16807
17224
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
16808
|
-
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score,
|
|
17225
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
|
|
16809
17226
|
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
16810
17227
|
const finalResult = providerError ? {
|
|
16811
17228
|
...result,
|
|
@@ -17006,7 +17423,8 @@ async function evaluateCandidate(options) {
|
|
|
17006
17423
|
targetResolver,
|
|
17007
17424
|
availableTargets,
|
|
17008
17425
|
fileChanges,
|
|
17009
|
-
workspacePath
|
|
17426
|
+
workspacePath,
|
|
17427
|
+
threshold: evalThreshold
|
|
17010
17428
|
});
|
|
17011
17429
|
const completedAt = nowFn();
|
|
17012
17430
|
let agentRequest;
|
|
@@ -17037,7 +17455,7 @@ async function evaluateCandidate(options) {
|
|
|
17037
17455
|
return {
|
|
17038
17456
|
timestamp: completedAt.toISOString(),
|
|
17039
17457
|
testId: evalCase.id,
|
|
17040
|
-
|
|
17458
|
+
suite: evalCase.suite,
|
|
17041
17459
|
category: evalCase.category,
|
|
17042
17460
|
conversationId: evalCase.conversation_id,
|
|
17043
17461
|
score: score.score,
|
|
@@ -17080,7 +17498,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
17080
17498
|
targetResolver,
|
|
17081
17499
|
availableTargets,
|
|
17082
17500
|
fileChanges,
|
|
17083
|
-
workspacePath
|
|
17501
|
+
workspacePath,
|
|
17502
|
+
threshold
|
|
17084
17503
|
} = options;
|
|
17085
17504
|
if (evalCase.assertions && evalCase.assertions.length > 0) {
|
|
17086
17505
|
return runEvaluatorList({
|
|
@@ -17106,7 +17525,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
17106
17525
|
targetResolver,
|
|
17107
17526
|
availableTargets,
|
|
17108
17527
|
fileChanges,
|
|
17109
|
-
workspacePath
|
|
17528
|
+
workspacePath,
|
|
17529
|
+
threshold
|
|
17110
17530
|
});
|
|
17111
17531
|
}
|
|
17112
17532
|
const evaluatorKind = evalCase.evaluator ?? "llm-grader";
|
|
@@ -17208,7 +17628,8 @@ async function runEvaluatorList(options) {
|
|
|
17208
17628
|
name: evaluatorConfig.name,
|
|
17209
17629
|
type: evaluatorConfig.type,
|
|
17210
17630
|
weight,
|
|
17211
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17631
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
17632
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
17212
17633
|
});
|
|
17213
17634
|
scores.push({
|
|
17214
17635
|
name: evaluatorConfig.name,
|
|
@@ -17243,7 +17664,8 @@ async function runEvaluatorList(options) {
|
|
|
17243
17664
|
name: evaluatorConfig.name ?? "unknown",
|
|
17244
17665
|
type: evaluatorConfig.type ?? "llm-grader",
|
|
17245
17666
|
weight,
|
|
17246
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17667
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
17668
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
17247
17669
|
});
|
|
17248
17670
|
scores.push({
|
|
17249
17671
|
name: evaluatorConfig.name ?? "unknown",
|
|
@@ -17277,9 +17699,10 @@ async function runEvaluatorList(options) {
|
|
|
17277
17699
|
}
|
|
17278
17700
|
}
|
|
17279
17701
|
}
|
|
17702
|
+
const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
|
|
17280
17703
|
const hasRequiredFailure = scored.some((entry) => {
|
|
17281
17704
|
if (!entry.required) return false;
|
|
17282
|
-
const minScore = typeof entry.required === "number" ? entry.required :
|
|
17705
|
+
const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
|
|
17283
17706
|
return entry.score.score < minScore;
|
|
17284
17707
|
});
|
|
17285
17708
|
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
@@ -17290,17 +17713,23 @@ async function runEvaluatorList(options) {
|
|
|
17290
17713
|
const expectedAspectCount = assertions.length || 1;
|
|
17291
17714
|
const score = {
|
|
17292
17715
|
score: aggregateScore,
|
|
17293
|
-
verdict: scoreToVerdict(aggregateScore),
|
|
17716
|
+
verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
|
|
17294
17717
|
assertions,
|
|
17295
17718
|
expectedAspectCount
|
|
17296
17719
|
};
|
|
17297
17720
|
return { score, scores };
|
|
17298
17721
|
}
|
|
17722
|
+
function formatFilter(filter) {
|
|
17723
|
+
return typeof filter === "string" ? filter : filter.join(", ");
|
|
17724
|
+
}
|
|
17725
|
+
function matchesFilter3(id, filter) {
|
|
17726
|
+
return typeof filter === "string" ? micromatch3.isMatch(id, filter) : filter.some((pattern) => micromatch3.isMatch(id, pattern));
|
|
17727
|
+
}
|
|
17299
17728
|
function filterEvalCases(evalCases, filter) {
|
|
17300
17729
|
if (!filter) {
|
|
17301
17730
|
return evalCases;
|
|
17302
17731
|
}
|
|
17303
|
-
return evalCases.filter((evalCase) =>
|
|
17732
|
+
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
|
|
17304
17733
|
}
|
|
17305
17734
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
17306
17735
|
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
@@ -17387,7 +17816,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
17387
17816
|
return {
|
|
17388
17817
|
timestamp: timestamp.toISOString(),
|
|
17389
17818
|
testId: evalCase.id,
|
|
17390
|
-
|
|
17819
|
+
suite: evalCase.suite,
|
|
17391
17820
|
category: evalCase.category,
|
|
17392
17821
|
conversationId: evalCase.conversation_id,
|
|
17393
17822
|
score: 0,
|
|
@@ -17659,6 +18088,7 @@ async function evaluate(config) {
|
|
|
17659
18088
|
verbose: config.verbose,
|
|
17660
18089
|
maxConcurrency: config.workers ?? 3,
|
|
17661
18090
|
filter: config.filter,
|
|
18091
|
+
threshold: config.threshold,
|
|
17662
18092
|
evalCases,
|
|
17663
18093
|
onResult: async (result) => {
|
|
17664
18094
|
collectedResults.push(result);
|
|
@@ -17669,19 +18099,19 @@ async function evaluate(config) {
|
|
|
17669
18099
|
const durationMs = Date.now() - startTime;
|
|
17670
18100
|
return {
|
|
17671
18101
|
results: allResults,
|
|
17672
|
-
summary: computeSummary(allResults, durationMs)
|
|
18102
|
+
summary: computeSummary(allResults, durationMs, config.threshold)
|
|
17673
18103
|
};
|
|
17674
18104
|
}
|
|
17675
18105
|
/**
 * Convert an assertion type name from snake_case to kebab-case.
 *
 * @param {string} type - Assertion type name.
 * @returns {string} The name with every "_" replaced by "-".
 */
function mapAssertionType(type) {
  return type.split("_").join("-");
}
|
|
17678
|
-
function computeSummary(results, durationMs) {
|
|
18108
|
+
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
17679
18109
|
const total = results.length;
|
|
17680
18110
|
let passed = 0;
|
|
17681
18111
|
let scoreSum = 0;
|
|
17682
18112
|
for (const r of results) {
|
|
17683
18113
|
scoreSum += r.score;
|
|
17684
|
-
if (r.score >=
|
|
18114
|
+
if (r.score >= threshold) {
|
|
17685
18115
|
passed++;
|
|
17686
18116
|
}
|
|
17687
18117
|
}
|
|
@@ -17794,7 +18224,7 @@ var CONFIG_FILE_NAMES = [
|
|
|
17794
18224
|
];
|
|
17795
18225
|
async function loadTsConfig(projectRoot) {
|
|
17796
18226
|
const { existsSync: existsSync7 } = await import("node:fs");
|
|
17797
|
-
const { pathToFileURL } = await import("node:url");
|
|
18227
|
+
const { pathToFileURL: pathToFileURL2 } = await import("node:url");
|
|
17798
18228
|
const { join: join2 } = await import("node:path");
|
|
17799
18229
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
17800
18230
|
const filePath = join2(projectRoot, fileName);
|
|
@@ -17802,7 +18232,7 @@ async function loadTsConfig(projectRoot) {
|
|
|
17802
18232
|
continue;
|
|
17803
18233
|
}
|
|
17804
18234
|
try {
|
|
17805
|
-
const fileUrl =
|
|
18235
|
+
const fileUrl = pathToFileURL2(filePath).href;
|
|
17806
18236
|
const mod = await import(fileUrl);
|
|
17807
18237
|
const config = mod.default ?? mod;
|
|
17808
18238
|
return AgentVConfigSchema.parse(config);
|
|
@@ -17936,7 +18366,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
17936
18366
|
}
|
|
17937
18367
|
|
|
17938
18368
|
// src/projects.ts
|
|
17939
|
-
import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
18369
|
+
import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
17940
18370
|
import path47 from "node:path";
|
|
17941
18371
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
17942
18372
|
function getProjectsRegistryPath() {
|
|
@@ -17962,7 +18392,7 @@ function saveProjectRegistry(registry) {
|
|
|
17962
18392
|
const registryPath = getProjectsRegistryPath();
|
|
17963
18393
|
const dir = path47.dirname(registryPath);
|
|
17964
18394
|
if (!existsSync6(dir)) {
|
|
17965
|
-
|
|
18395
|
+
mkdirSync2(dir, { recursive: true });
|
|
17966
18396
|
}
|
|
17967
18397
|
writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
|
|
17968
18398
|
}
|
|
@@ -18228,7 +18658,7 @@ var OtelTraceExporter = class {
|
|
|
18228
18658
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
18229
18659
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
18230
18660
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
18231
|
-
if (result.
|
|
18661
|
+
if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
|
|
18232
18662
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
18233
18663
|
if (captureContent && result.output.length > 0) {
|
|
18234
18664
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -18437,7 +18867,7 @@ var OtelStreamingObserver = class {
|
|
|
18437
18867
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
18438
18868
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
18439
18869
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
18440
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
18870
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
|
|
18441
18871
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
18442
18872
|
}
|
|
18443
18873
|
/** Create and immediately export a tool span */
|
|
@@ -18786,11 +19216,241 @@ function extractToolResultContent(content) {
|
|
|
18786
19216
|
return parts.length > 0 ? parts.join("") : void 0;
|
|
18787
19217
|
}
|
|
18788
19218
|
|
|
18789
|
-
// src/import/
|
|
19219
|
+
// src/import/codex-parser.ts
|
|
19220
|
+
/**
 * Parse the JSONL text of a Codex rollout file into a normalized transcript.
 *
 * Each non-blank line is parsed as JSON; lines that are not valid JSON, are not
 * objects, or carry no `type` are skipped. Recognized entry types:
 *  - `session_meta`: session id, cwd, CLI version and (first seen) model.
 *  - `turn_context`: model / cwd, used only when not already known.
 *  - `response_item`: user/assistant messages (developer messages are dropped),
 *    tool calls (`function_call`, `custom_tool_call`) and their outputs, which
 *    are attached to the originating call by `call_id`. `reasoning` items are
 *    ignored.
 *
 * @param {string} jsonl - Raw rollout file contents, one JSON object per line.
 * @returns {{
 *   messages: Array<object>,
 *   source: { provider: "codex", sessionId: string, cwd?: string, startedAt?: string, model?: string, version?: string },
 *   tokenUsage: undefined,
 *   durationMs: number | undefined,
 *   costUsd: null
 * }} Messages in file order plus session metadata. `durationMs` is the span
 *   between the first and last entry timestamps, or `undefined` when either is
 *   missing or cannot be parsed as a date.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;
  // call_id -> location of the tool call that is still waiting for its output.
  const pendingCalls = new Map();

  // Tool arguments are usually a JSON-encoded string; keep the raw value when
  // it is not a string or does not parse.
  const decodeToolInput = (raw) => {
    if (typeof raw !== "string") return raw;
    try {
      return JSON.parse(raw);
    } catch {
      return raw;
    }
  };

  // Shared by function_call and custom_tool_call: each call becomes its own
  // assistant message so a later *_output item can be attached by call_id.
  const recordToolCall = (payload) => {
    const callId = String(payload.call_id ?? "");
    const toolCall = {
      tool: String(payload.name ?? ""),
      // NOTE(review): custom_tool_call items appear to carry their argument
      // text in `input` rather than `arguments` — confirm against the Codex
      // rollout schema. The fallback only applies when `arguments` is absent.
      input: decodeToolInput(payload.arguments ?? payload.input),
      id: callId
    };
    const msgIdx = messages.length;
    messages.push({ role: "assistant", toolCalls: [toolCall] });
    if (callId) {
      pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
    }
  };

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // JSON.parse also accepts bare `null`, numbers and strings; only objects
    // with a `type` are rollout records.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;
    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        const role = String(payload.role ?? "");
        switch (itemType) {
          case "message": {
            if (role === "developer") break;
            const content = extractResponseItemContent(payload.content);
            if (content && (role === "user" || role === "assistant")) {
              messages.push({ role, content });
            }
            break;
          }
          case "function_call":
          case "custom_tool_call": {
            recordToolCall(payload);
            break;
          }
          case "function_call_output":
          case "custom_tool_call_output": {
            const callId = String(payload.call_id ?? "");
            const pending = pendingCalls.get(callId);
            if (pending) {
              const existingMsg = messages[pending.msgIdx];
              const existingCalls = [...existingMsg.toolCalls ?? []];
              existingCalls[pending.toolIdx] = {
                ...existingCalls[pending.toolIdx],
                output: payload.output
              };
              messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
              pendingCalls.delete(callId);
            }
            break;
          }
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps yield NaN; report "unknown" instead.
    if (Number.isFinite(elapsed)) {
      durationMs = elapsed;
    }
  }

  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
|
|
19369
|
+
/**
 * Flatten the `content` of a response item into plain text.
 *
 * @param {unknown} content - A string, or an array of content blocks.
 * @returns {string | undefined} The string unchanged; for an array, the `text`
 *   of every object block that has a string `text`, concatenated with no
 *   separator; `undefined` for any other input or when no block has text.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return undefined;
  }
  const texts = content
    .filter((block) => typeof block === "object" && block !== null && typeof block.text === "string")
    .map((block) => block.text);
  return texts.length === 0 ? undefined : texts.join("");
}
|
|
19383
|
+
|
|
19384
|
+
// src/import/codex-session-discovery.ts
|
|
18790
19385
|
import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
18791
19386
|
import { homedir as homedir3 } from "node:os";
|
|
18792
19387
|
import path48 from "node:path";
|
|
18793
|
-
var
|
|
19388
|
+
/** Default location of Codex rollout files: `<home>/.codex/sessions`. */
var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");

/**
 * Find Codex rollout files under `<sessionsDir>/<year>/<month>/<day>/`.
 *
 * Only files named `rollout-*.jsonl` are considered. Directories (or stray
 * files) that cannot be listed are skipped rather than treated as errors, so a
 * missing sessions directory simply yields an empty result.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] - Root to scan; defaults to `DEFAULT_SESSIONS_DIR()`.
 * @param {boolean} [opts.latest] - When truthy, return only the newest session.
 * @param {number} [opts.limit] - Maximum number of sessions (default 10); ignored when `latest` is set.
 * @param {string} [opts.date] - Keep only the day directory whose `${year}-${month}-${day}` equals this value.
 * @returns {Promise<Array<{ sessionId: string, filePath: string, filename: string, updatedAt: Date }>>}
 *   Sessions sorted by modification time, newest first, truncated to the limit.
 */
async function discoverCodexSessions(opts) {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;

  // Best-effort listing: an unreadable or non-directory entry contributes nothing.
  const listEntries = async (dir) => {
    try {
      return await readdir8(dir);
    } catch {
      return [];
    }
  };

  // Pass 1: walk year/month/day and collect candidate rollout files.
  const rolloutFiles = [];
  for (const year of await listEntries(sessionsDir)) {
    const yearPath = path48.join(sessionsDir, year);
    for (const month of await listEntries(yearPath)) {
      const monthPath = path48.join(yearPath, month);
      for (const day of await listEntries(monthPath)) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;
        const dayPath = path48.join(monthPath, day);
        for (const file of await listEntries(dayPath)) {
          if (file.startsWith("rollout-") && file.endsWith(".jsonl")) {
            rolloutFiles.push({ file, filePath: path48.join(dayPath, file) });
          }
        }
      }
    }
  }

  // Pass 2: stat every candidate concurrently; the calls are independent.
  const sessions = await Promise.all(
    rolloutFiles.map(async ({ file, filePath }) => {
      const nameWithoutExt = file.replace(/\.jsonl$/, "");
      const parts = nameWithoutExt.split("-");
      // The last five dash-separated segments are taken as the session id;
      // shorter names fall back to the whole basename.
      const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
      let updatedAt;
      try {
        updatedAt = (await stat9(filePath)).mtime;
      } catch {
        // File vanished or is unreadable: sort it last instead of failing.
        updatedAt = new Date(0);
      }
      return { sessionId, filePath, filename: file, updatedAt };
    })
  );

  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
|
|
19448
|
+
|
|
19449
|
+
// src/import/session-discovery.ts
|
|
19450
|
+
import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
|
|
19451
|
+
import { homedir as homedir4 } from "node:os";
|
|
19452
|
+
import path49 from "node:path";
|
|
19453
|
+
var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
|
|
18794
19454
|
/**
 * Encode a project path as a single directory name.
 *
 * @param {string} projectPath - Path using "/" separators.
 * @returns {string} The path with every "/" replaced by "-".
 */
function encodeProjectPath(projectPath) {
  return projectPath.split("/").join("-");
}
|
|
@@ -18799,7 +19459,7 @@ async function discoverClaudeSessions(opts) {
|
|
|
18799
19459
|
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
18800
19460
|
let projectDirs;
|
|
18801
19461
|
try {
|
|
18802
|
-
projectDirs = await
|
|
19462
|
+
projectDirs = await readdir9(projectsDir);
|
|
18803
19463
|
} catch {
|
|
18804
19464
|
return [];
|
|
18805
19465
|
}
|
|
@@ -18809,10 +19469,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
18809
19469
|
}
|
|
18810
19470
|
const sessions = [];
|
|
18811
19471
|
for (const projectDir of projectDirs) {
|
|
18812
|
-
const dirPath =
|
|
19472
|
+
const dirPath = path49.join(projectsDir, projectDir);
|
|
18813
19473
|
let entries;
|
|
18814
19474
|
try {
|
|
18815
|
-
entries = await
|
|
19475
|
+
entries = await readdir9(dirPath);
|
|
18816
19476
|
} catch {
|
|
18817
19477
|
continue;
|
|
18818
19478
|
}
|
|
@@ -18820,10 +19480,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
18820
19480
|
if (!entry.endsWith(".jsonl")) continue;
|
|
18821
19481
|
const sessionId = entry.replace(/\.jsonl$/, "");
|
|
18822
19482
|
if (opts?.sessionId && sessionId !== opts.sessionId) continue;
|
|
18823
|
-
const filePath =
|
|
19483
|
+
const filePath = path49.join(dirPath, entry);
|
|
18824
19484
|
let updatedAt;
|
|
18825
19485
|
try {
|
|
18826
|
-
const fileStat = await
|
|
19486
|
+
const fileStat = await stat10(filePath);
|
|
18827
19487
|
updatedAt = fileStat.mtime;
|
|
18828
19488
|
} catch {
|
|
18829
19489
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -18842,10 +19502,85 @@ async function discoverClaudeSessions(opts) {
|
|
|
18842
19502
|
|
|
18843
19503
|
// src/import/types.ts
|
|
18844
19504
|
import { readFile as readFile14 } from "node:fs/promises";
|
|
19505
|
+
/**
 * Convert a parsed transcript entry (camelCase) into the snake_case record
 * written as one line of a transcript JSONL file.
 *
 * @param {object} entry - Parsed session with `messages`, `source`, and
 *   optional `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} Record with `input` (content of the first user message when
 *   it is a string, otherwise ""), `output` (all messages), `token_usage`,
 *   `duration_ms`, `cost_usd` and a `source` block. `source.cwd` falls back to
 *   `source.projectPath` when `source.cwd` is null or undefined.
 */
function toTranscriptJsonLine(entry) {
  const firstUser = entry.messages.find((message) => message.role === "user");
  let input = "";
  if (typeof firstUser?.content === "string") {
    input = firstUser.content;
  }

  let tokenUsage;
  if (entry.tokenUsage) {
    const { input: inputTokens, output: outputTokens, cached } = entry.tokenUsage;
    tokenUsage = { input: inputTokens, output: outputTokens, cached };
  }

  const origin = entry.source;
  return {
    input,
    output: entry.messages,
    token_usage: tokenUsage,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: {
      provider: origin.provider,
      session_id: origin.sessionId,
      model: origin.model,
      timestamp: origin.startedAt,
      git_branch: origin.gitBranch,
      cwd: origin.cwd ?? origin.projectPath,
      version: origin.version
    }
  };
}
|
|
19529
|
+
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * @param {string} filePath - Path of the JSONL file.
 * @returns {Promise<unknown[]>} Parsed records in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message names the
 *   file and the 1-based line number, and the parser error is kept as `cause`.
 *   File-system errors from reading the file propagate unchanged.
 */
async function readTranscriptJsonl(filePath) {
  const raw = await readFile14(filePath, "utf8");
  // JSON.parse rejects a leading byte-order mark, so drop it up front.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  const records = [];
  for (const [index, row] of text.split("\n").entries()) {
    if (row.trim().length === 0) continue;
    try {
      records.push(JSON.parse(row));
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(
        `Invalid JSON in transcript ${filePath} at line ${index + 1}: ${reason}`,
        { cause: error }
      );
    }
  }
  return records;
}
|
|
18845
19533
|
/**
 * Read a transcript file as UTF-8 text.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file contents, unparsed.
 */
async function readTranscriptFile(filePath) {
  const contents = await readFile14(filePath, { encoding: "utf8" });
  return contents;
}
|
|
18848
19536
|
|
|
19537
|
+
// src/import/transcript-provider.ts
|
|
19538
|
+
/**
 * Provider that replays pre-recorded transcript lines instead of calling a
 * live target. Every `invoke` call consumes the next line in order, so the
 * i-th invocation returns the i-th transcript line regardless of the request.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  // Index of the next transcript line to hand out.
  cursor = 0;
  /**
   * @param {string} targetName - Name used for `targetName` and in `id` (`transcript:<name>`).
   * @param {Array<object>} lines - Parsed transcript records (snake_case fields).
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The provider is named after `source.provider` of the first line, or
   * "transcript" when that line has no such field.
   *
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<_TranscriptProvider>}
   * @throws {Error} When the file contains no records.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // `source` may be missing on hand-written lines; fall back instead of throwing.
    const providerName = lines[0]?.source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript lines loaded. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next recorded response.
   *
   * @param {unknown} _request - Ignored; responses are replayed in order.
   * @returns {Promise<{ output: unknown, tokenUsage: object | undefined, durationMs: number | undefined, costUsd: number | undefined, startTime: string | undefined }>}
   * @throws {Error} When more invocations are made than lines are available.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // A null cost in the file is reported as "unknown".
      costUsd: line.cost_usd ?? void 0,
      startTime: line.source?.timestamp
    };
  }
};
|
|
19583
|
+
|
|
18849
19584
|
// src/index.ts
|
|
18850
19585
|
function createAgentKernel() {
|
|
18851
19586
|
return { status: "stub" };
|
|
@@ -18859,6 +19594,7 @@ export {
|
|
|
18859
19594
|
DEFAULT_EVALUATOR_TEMPLATE,
|
|
18860
19595
|
DEFAULT_EVAL_PATTERNS,
|
|
18861
19596
|
DEFAULT_EXPLORATION_TOOLS,
|
|
19597
|
+
DEFAULT_THRESHOLD,
|
|
18862
19598
|
DeterministicAssertionEvaluator,
|
|
18863
19599
|
EvaluatorRegistry,
|
|
18864
19600
|
ExecutionMetricsEvaluator,
|
|
@@ -18880,6 +19616,7 @@ export {
|
|
|
18880
19616
|
TemplateNotFoundError,
|
|
18881
19617
|
TokenUsageEvaluator,
|
|
18882
19618
|
ToolTrajectoryEvaluator,
|
|
19619
|
+
TranscriptProvider,
|
|
18883
19620
|
WorkspaceCreationError,
|
|
18884
19621
|
WorkspacePoolManager,
|
|
18885
19622
|
addProject,
|
|
@@ -18916,6 +19653,7 @@ export {
|
|
|
18916
19653
|
detectFormat,
|
|
18917
19654
|
discoverAssertions,
|
|
18918
19655
|
discoverClaudeSessions,
|
|
19656
|
+
discoverCodexSessions,
|
|
18919
19657
|
discoverCopilotSessions,
|
|
18920
19658
|
discoverGraders,
|
|
18921
19659
|
discoverGraders as discoverJudges,
|
|
@@ -18976,6 +19714,8 @@ export {
|
|
|
18976
19714
|
normalizeLineEndings,
|
|
18977
19715
|
parseAgentSkillsEvals,
|
|
18978
19716
|
parseClaudeSession,
|
|
19717
|
+
parseCodexSession,
|
|
19718
|
+
parseCopilotEvents,
|
|
18979
19719
|
parseJsonFromText,
|
|
18980
19720
|
parseJsonSafe,
|
|
18981
19721
|
readJsonFile,
|
|
@@ -18983,6 +19723,7 @@ export {
|
|
|
18983
19723
|
readTestSuiteMetadata,
|
|
18984
19724
|
readTextFile,
|
|
18985
19725
|
readTranscriptFile,
|
|
19726
|
+
readTranscriptJsonl,
|
|
18986
19727
|
removeProject,
|
|
18987
19728
|
resolveAndCreateProvider,
|
|
18988
19729
|
resolveDelegatedTargetDefinition,
|
|
@@ -19015,6 +19756,7 @@ export {
|
|
|
19015
19756
|
substituteVariables,
|
|
19016
19757
|
toCamelCaseDeep,
|
|
19017
19758
|
toSnakeCaseDeep,
|
|
19759
|
+
toTranscriptJsonLine,
|
|
19018
19760
|
tokensPerTool,
|
|
19019
19761
|
touchProject,
|
|
19020
19762
|
transpileEvalYaml,
|