@agentv/core 4.6.0 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-AIQ5FO4G.js → chunk-75RFVESM.js} +273 -125
- package/dist/chunk-75RFVESM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +110 -95
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +30 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1488 -517
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +291 -74
- package/dist/index.d.ts +291 -74
- package/dist/index.js +1187 -369
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-AIQ5FO4G.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
COMMON_TARGET_SETTINGS,
|
|
3
|
+
LLM_GRADER_CAPABLE_KINDS,
|
|
3
4
|
TEST_MESSAGE_ROLES,
|
|
4
5
|
buildDirectoryChain,
|
|
5
6
|
buildSearchRoots,
|
|
@@ -21,9 +22,10 @@ import {
|
|
|
21
22
|
normalizeLineEndings,
|
|
22
23
|
readJsonFile,
|
|
23
24
|
readTextFile,
|
|
25
|
+
resolveDelegatedTargetDefinition,
|
|
24
26
|
resolveFileReference,
|
|
25
27
|
resolveTargetDefinition
|
|
26
|
-
} from "./chunk-AIQ5FO4G.js";
|
|
28
|
+
} from "./chunk-75RFVESM.js";
|
|
27
29
|
import {
|
|
28
30
|
AgentvProvider
|
|
29
31
|
} from "./chunk-PRNXHNLF.js";
|
|
@@ -855,8 +857,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
855
857
|
const negate = rawEvaluator.negate === true ? true : void 0;
|
|
856
858
|
if (isCustomType) {
|
|
857
859
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
858
|
-
const required2 =
|
|
859
|
-
|
|
860
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
861
|
+
rawEvaluator.required,
|
|
862
|
+
rawEvaluator.min_score,
|
|
863
|
+
name,
|
|
864
|
+
evalId
|
|
865
|
+
);
|
|
866
|
+
const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
|
|
860
867
|
const config2 = {};
|
|
861
868
|
for (const [key, value] of Object.entries(rawEvaluator)) {
|
|
862
869
|
if (!knownProps2.has(key) && value !== void 0) {
|
|
@@ -868,6 +875,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
868
875
|
type: customTypeName,
|
|
869
876
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
870
877
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
878
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
871
879
|
...negate !== void 0 ? { negate } : {},
|
|
872
880
|
...Object.keys(config2).length > 0 ? { config: config2 } : {}
|
|
873
881
|
});
|
|
@@ -937,7 +945,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
937
945
|
);
|
|
938
946
|
}
|
|
939
947
|
}
|
|
940
|
-
const required2 =
|
|
948
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
949
|
+
rawEvaluator.required,
|
|
950
|
+
rawEvaluator.min_score,
|
|
951
|
+
name,
|
|
952
|
+
evalId
|
|
953
|
+
);
|
|
941
954
|
const knownProps2 = /* @__PURE__ */ new Set([
|
|
942
955
|
"name",
|
|
943
956
|
"type",
|
|
@@ -963,6 +976,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
963
976
|
resolvedCwd,
|
|
964
977
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
965
978
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
979
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
966
980
|
...negate !== void 0 ? { negate } : {},
|
|
967
981
|
...Object.keys(config2).length > 0 ? { config: config2 } : {},
|
|
968
982
|
...targetConfig !== void 0 ? { target: targetConfig } : {}
|
|
@@ -1091,7 +1105,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1091
1105
|
};
|
|
1092
1106
|
}
|
|
1093
1107
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1094
|
-
const required2 =
|
|
1108
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1109
|
+
rawEvaluator.required,
|
|
1110
|
+
rawEvaluator.min_score,
|
|
1111
|
+
name,
|
|
1112
|
+
evalId
|
|
1113
|
+
);
|
|
1095
1114
|
evaluators.push({
|
|
1096
1115
|
name,
|
|
1097
1116
|
type: "composite",
|
|
@@ -1099,6 +1118,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1099
1118
|
aggregator,
|
|
1100
1119
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1101
1120
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1121
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1102
1122
|
...negate !== void 0 ? { negate } : {}
|
|
1103
1123
|
});
|
|
1104
1124
|
continue;
|
|
@@ -1209,7 +1229,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1209
1229
|
continue;
|
|
1210
1230
|
}
|
|
1211
1231
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1212
|
-
const required2 =
|
|
1232
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1233
|
+
rawEvaluator.required,
|
|
1234
|
+
rawEvaluator.min_score,
|
|
1235
|
+
name,
|
|
1236
|
+
evalId
|
|
1237
|
+
);
|
|
1213
1238
|
const config2 = {
|
|
1214
1239
|
name,
|
|
1215
1240
|
type: "tool-trajectory",
|
|
@@ -1218,6 +1243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1218
1243
|
...expected ? { expected } : {},
|
|
1219
1244
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1220
1245
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1246
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1221
1247
|
...negate !== void 0 ? { negate } : {},
|
|
1222
1248
|
...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
|
|
1223
1249
|
};
|
|
@@ -1280,7 +1306,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1280
1306
|
const aggregation = asString(rawEvaluator.aggregation);
|
|
1281
1307
|
const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
|
|
1282
1308
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1283
|
-
const required2 =
|
|
1309
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1310
|
+
rawEvaluator.required,
|
|
1311
|
+
rawEvaluator.min_score,
|
|
1312
|
+
name,
|
|
1313
|
+
evalId
|
|
1314
|
+
);
|
|
1284
1315
|
evaluators.push({
|
|
1285
1316
|
name,
|
|
1286
1317
|
type: "field-accuracy",
|
|
@@ -1288,6 +1319,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1288
1319
|
...validAggregation ? { aggregation: validAggregation } : {},
|
|
1289
1320
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1290
1321
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1322
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1291
1323
|
...negate !== void 0 ? { negate } : {}
|
|
1292
1324
|
});
|
|
1293
1325
|
continue;
|
|
@@ -1301,13 +1333,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1301
1333
|
continue;
|
|
1302
1334
|
}
|
|
1303
1335
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1304
|
-
const required2 =
|
|
1336
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1337
|
+
rawEvaluator.required,
|
|
1338
|
+
rawEvaluator.min_score,
|
|
1339
|
+
name,
|
|
1340
|
+
evalId
|
|
1341
|
+
);
|
|
1305
1342
|
evaluators.push({
|
|
1306
1343
|
name,
|
|
1307
1344
|
type: "latency",
|
|
1308
1345
|
threshold,
|
|
1309
1346
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1310
1347
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1348
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1311
1349
|
...negate !== void 0 ? { negate } : {}
|
|
1312
1350
|
});
|
|
1313
1351
|
continue;
|
|
@@ -1321,13 +1359,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1321
1359
|
continue;
|
|
1322
1360
|
}
|
|
1323
1361
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1324
|
-
const required2 =
|
|
1362
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1363
|
+
rawEvaluator.required,
|
|
1364
|
+
rawEvaluator.min_score,
|
|
1365
|
+
name,
|
|
1366
|
+
evalId
|
|
1367
|
+
);
|
|
1325
1368
|
evaluators.push({
|
|
1326
1369
|
name,
|
|
1327
1370
|
type: "cost",
|
|
1328
1371
|
budget,
|
|
1329
1372
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1330
1373
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1374
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1331
1375
|
...negate !== void 0 ? { negate } : {}
|
|
1332
1376
|
});
|
|
1333
1377
|
continue;
|
|
@@ -1359,13 +1403,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1359
1403
|
continue;
|
|
1360
1404
|
}
|
|
1361
1405
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1362
|
-
const required2 =
|
|
1406
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1407
|
+
rawEvaluator.required,
|
|
1408
|
+
rawEvaluator.min_score,
|
|
1409
|
+
name,
|
|
1410
|
+
evalId
|
|
1411
|
+
);
|
|
1363
1412
|
evaluators.push({
|
|
1364
1413
|
name,
|
|
1365
1414
|
type: "token-usage",
|
|
1366
1415
|
...validLimits,
|
|
1367
1416
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1368
1417
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1418
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1369
1419
|
...negate !== void 0 ? { negate } : {}
|
|
1370
1420
|
});
|
|
1371
1421
|
continue;
|
|
@@ -1411,13 +1461,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1411
1461
|
continue;
|
|
1412
1462
|
}
|
|
1413
1463
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1414
|
-
const required2 =
|
|
1464
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1465
|
+
rawEvaluator.required,
|
|
1466
|
+
rawEvaluator.min_score,
|
|
1467
|
+
name,
|
|
1468
|
+
evalId
|
|
1469
|
+
);
|
|
1415
1470
|
evaluators.push({
|
|
1416
1471
|
name,
|
|
1417
1472
|
type: "execution-metrics",
|
|
1418
1473
|
...validThresholds,
|
|
1419
1474
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1420
1475
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1476
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1421
1477
|
...negate !== void 0 ? { negate } : {}
|
|
1422
1478
|
});
|
|
1423
1479
|
continue;
|
|
@@ -1431,7 +1487,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1431
1487
|
const rawShouldTrigger = rawEvaluator.should_trigger;
|
|
1432
1488
|
const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
|
|
1433
1489
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1434
|
-
const required2 =
|
|
1490
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1491
|
+
rawEvaluator.required,
|
|
1492
|
+
rawEvaluator.min_score,
|
|
1493
|
+
name,
|
|
1494
|
+
evalId
|
|
1495
|
+
);
|
|
1435
1496
|
evaluators.push({
|
|
1436
1497
|
name,
|
|
1437
1498
|
type: "skill-trigger",
|
|
@@ -1439,6 +1500,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1439
1500
|
...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
|
|
1440
1501
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1441
1502
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1503
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1442
1504
|
...negate !== void 0 ? { negate } : {}
|
|
1443
1505
|
});
|
|
1444
1506
|
continue;
|
|
@@ -1450,13 +1512,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1450
1512
|
continue;
|
|
1451
1513
|
}
|
|
1452
1514
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1453
|
-
const required2 =
|
|
1515
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1516
|
+
rawEvaluator.required,
|
|
1517
|
+
rawEvaluator.min_score,
|
|
1518
|
+
name,
|
|
1519
|
+
evalId
|
|
1520
|
+
);
|
|
1454
1521
|
evaluators.push({
|
|
1455
1522
|
name,
|
|
1456
1523
|
type: "contains",
|
|
1457
1524
|
value,
|
|
1458
1525
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1459
1526
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1527
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1460
1528
|
...negate !== void 0 ? { negate } : {}
|
|
1461
1529
|
});
|
|
1462
1530
|
continue;
|
|
@@ -1470,13 +1538,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1470
1538
|
continue;
|
|
1471
1539
|
}
|
|
1472
1540
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1473
|
-
const required2 =
|
|
1541
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1542
|
+
rawEvaluator.required,
|
|
1543
|
+
rawEvaluator.min_score,
|
|
1544
|
+
name,
|
|
1545
|
+
evalId
|
|
1546
|
+
);
|
|
1474
1547
|
evaluators.push({
|
|
1475
1548
|
name,
|
|
1476
1549
|
type: typeValue,
|
|
1477
1550
|
value,
|
|
1478
1551
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1479
1552
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1553
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1480
1554
|
...negate !== void 0 ? { negate } : {}
|
|
1481
1555
|
});
|
|
1482
1556
|
continue;
|
|
@@ -1488,13 +1562,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1488
1562
|
continue;
|
|
1489
1563
|
}
|
|
1490
1564
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1491
|
-
const required2 =
|
|
1565
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1566
|
+
rawEvaluator.required,
|
|
1567
|
+
rawEvaluator.min_score,
|
|
1568
|
+
name,
|
|
1569
|
+
evalId
|
|
1570
|
+
);
|
|
1492
1571
|
evaluators.push({
|
|
1493
1572
|
name,
|
|
1494
1573
|
type: "icontains",
|
|
1495
1574
|
value,
|
|
1496
1575
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1497
1576
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1577
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1498
1578
|
...negate !== void 0 ? { negate } : {}
|
|
1499
1579
|
});
|
|
1500
1580
|
continue;
|
|
@@ -1508,13 +1588,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1508
1588
|
continue;
|
|
1509
1589
|
}
|
|
1510
1590
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1511
|
-
const required2 =
|
|
1591
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1592
|
+
rawEvaluator.required,
|
|
1593
|
+
rawEvaluator.min_score,
|
|
1594
|
+
name,
|
|
1595
|
+
evalId
|
|
1596
|
+
);
|
|
1512
1597
|
evaluators.push({
|
|
1513
1598
|
name,
|
|
1514
1599
|
type: typeValue,
|
|
1515
1600
|
value,
|
|
1516
1601
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1517
1602
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1603
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1518
1604
|
...negate !== void 0 ? { negate } : {}
|
|
1519
1605
|
});
|
|
1520
1606
|
continue;
|
|
@@ -1526,13 +1612,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1526
1612
|
continue;
|
|
1527
1613
|
}
|
|
1528
1614
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1529
|
-
const required2 =
|
|
1615
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1616
|
+
rawEvaluator.required,
|
|
1617
|
+
rawEvaluator.min_score,
|
|
1618
|
+
name,
|
|
1619
|
+
evalId
|
|
1620
|
+
);
|
|
1530
1621
|
evaluators.push({
|
|
1531
1622
|
name,
|
|
1532
1623
|
type: typeValue,
|
|
1533
1624
|
value,
|
|
1534
1625
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1535
1626
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1627
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1536
1628
|
...negate !== void 0 ? { negate } : {}
|
|
1537
1629
|
});
|
|
1538
1630
|
continue;
|
|
@@ -1545,7 +1637,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1545
1637
|
}
|
|
1546
1638
|
const flags = asString(rawEvaluator.flags);
|
|
1547
1639
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1548
|
-
const required2 =
|
|
1640
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1641
|
+
rawEvaluator.required,
|
|
1642
|
+
rawEvaluator.min_score,
|
|
1643
|
+
name,
|
|
1644
|
+
evalId
|
|
1645
|
+
);
|
|
1549
1646
|
evaluators.push({
|
|
1550
1647
|
name,
|
|
1551
1648
|
type: "regex",
|
|
@@ -1553,18 +1650,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1553
1650
|
...flags !== void 0 ? { flags } : {},
|
|
1554
1651
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1555
1652
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1653
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1556
1654
|
...negate !== void 0 ? { negate } : {}
|
|
1557
1655
|
});
|
|
1558
1656
|
continue;
|
|
1559
1657
|
}
|
|
1560
1658
|
if (typeValue === "is-json") {
|
|
1561
1659
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1562
|
-
const required2 =
|
|
1660
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1661
|
+
rawEvaluator.required,
|
|
1662
|
+
rawEvaluator.min_score,
|
|
1663
|
+
name,
|
|
1664
|
+
evalId
|
|
1665
|
+
);
|
|
1563
1666
|
evaluators.push({
|
|
1564
1667
|
name,
|
|
1565
1668
|
type: "is-json",
|
|
1566
1669
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1567
1670
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1671
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1568
1672
|
...negate !== void 0 ? { negate } : {}
|
|
1569
1673
|
});
|
|
1570
1674
|
continue;
|
|
@@ -1576,13 +1680,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1576
1680
|
continue;
|
|
1577
1681
|
}
|
|
1578
1682
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1579
|
-
const required2 =
|
|
1683
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1684
|
+
rawEvaluator.required,
|
|
1685
|
+
rawEvaluator.min_score,
|
|
1686
|
+
name,
|
|
1687
|
+
evalId
|
|
1688
|
+
);
|
|
1580
1689
|
evaluators.push({
|
|
1581
1690
|
name,
|
|
1582
1691
|
type: "equals",
|
|
1583
1692
|
value,
|
|
1584
1693
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1585
1694
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1695
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1586
1696
|
...negate !== void 0 ? { negate } : {}
|
|
1587
1697
|
});
|
|
1588
1698
|
continue;
|
|
@@ -1618,7 +1728,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1618
1728
|
continue;
|
|
1619
1729
|
}
|
|
1620
1730
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1621
|
-
const required2 =
|
|
1731
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1732
|
+
rawEvaluator.required,
|
|
1733
|
+
rawEvaluator.min_score,
|
|
1734
|
+
name,
|
|
1735
|
+
evalId
|
|
1736
|
+
);
|
|
1622
1737
|
evaluators.push({
|
|
1623
1738
|
name,
|
|
1624
1739
|
type: "llm-grader",
|
|
@@ -1626,6 +1741,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1626
1741
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1627
1742
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1628
1743
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1744
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1629
1745
|
...negate !== void 0 ? { negate } : {}
|
|
1630
1746
|
});
|
|
1631
1747
|
continue;
|
|
@@ -1695,7 +1811,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1695
1811
|
continue;
|
|
1696
1812
|
}
|
|
1697
1813
|
const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1698
|
-
const required2 =
|
|
1814
|
+
const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
|
|
1815
|
+
rawEvaluator.required,
|
|
1816
|
+
rawEvaluator.min_score,
|
|
1817
|
+
name,
|
|
1818
|
+
evalId
|
|
1819
|
+
);
|
|
1699
1820
|
evaluators.push({
|
|
1700
1821
|
name,
|
|
1701
1822
|
type: "llm-grader",
|
|
@@ -1703,12 +1824,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1703
1824
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1704
1825
|
...weight2 !== void 0 ? { weight: weight2 } : {},
|
|
1705
1826
|
...required2 !== void 0 ? { required: required2 } : {},
|
|
1827
|
+
...min_score2 !== void 0 ? { min_score: min_score2 } : {},
|
|
1706
1828
|
...negate !== void 0 ? { negate } : {}
|
|
1707
1829
|
});
|
|
1708
1830
|
continue;
|
|
1709
1831
|
}
|
|
1710
1832
|
const weight = validateWeight(rawEvaluator.weight, name, evalId);
|
|
1711
|
-
const required =
|
|
1833
|
+
const { required, min_score } = parseRequiredAndMinScore(
|
|
1834
|
+
rawEvaluator.required,
|
|
1835
|
+
rawEvaluator.min_score,
|
|
1836
|
+
name,
|
|
1837
|
+
evalId
|
|
1838
|
+
);
|
|
1712
1839
|
const knownProps = /* @__PURE__ */ new Set([
|
|
1713
1840
|
"name",
|
|
1714
1841
|
"type",
|
|
@@ -1719,6 +1846,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1719
1846
|
"weight",
|
|
1720
1847
|
"config",
|
|
1721
1848
|
"required",
|
|
1849
|
+
"min_score",
|
|
1722
1850
|
"negate",
|
|
1723
1851
|
"max_steps",
|
|
1724
1852
|
"maxSteps",
|
|
@@ -1748,6 +1876,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
1748
1876
|
...graderTargetName ? { target: graderTargetName } : {},
|
|
1749
1877
|
...weight !== void 0 ? { weight } : {},
|
|
1750
1878
|
...required !== void 0 ? { required } : {},
|
|
1879
|
+
...min_score !== void 0 ? { min_score } : {},
|
|
1751
1880
|
...negate !== void 0 ? { negate } : {},
|
|
1752
1881
|
...finalConfig ? { config: finalConfig } : {},
|
|
1753
1882
|
...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
|
|
@@ -1879,10 +2008,23 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
1879
2008
|
console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
|
|
1880
2009
|
}
|
|
1881
2010
|
}
|
|
1882
|
-
function
|
|
1883
|
-
|
|
1884
|
-
if (typeof
|
|
1885
|
-
|
|
2011
|
+
function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
|
|
2012
|
+
const result = {};
|
|
2013
|
+
if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
|
|
2014
|
+
result.min_score = rawMinScore;
|
|
2015
|
+
}
|
|
2016
|
+
if (rawRequired === true) {
|
|
2017
|
+
result.required = true;
|
|
2018
|
+
} else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
|
|
2019
|
+
if (result.min_score === void 0) {
|
|
2020
|
+
result.min_score = rawRequired;
|
|
2021
|
+
}
|
|
2022
|
+
result.required = rawRequired;
|
|
2023
|
+
logWarning2(
|
|
2024
|
+
`Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
|
|
2025
|
+
);
|
|
2026
|
+
}
|
|
2027
|
+
return result;
|
|
1886
2028
|
}
|
|
1887
2029
|
function validateWeight(rawWeight, evaluatorName, evalId) {
|
|
1888
2030
|
if (rawWeight === void 0) {
|
|
@@ -1925,16 +2067,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1925
2067
|
const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
|
|
1926
2068
|
const expectedOutcome = asString(rawRubric.outcome) ?? "";
|
|
1927
2069
|
const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
|
|
2070
|
+
let minScore;
|
|
1928
2071
|
let requiredMinScore;
|
|
1929
2072
|
let required;
|
|
1930
|
-
if (typeof rawRubric.
|
|
1931
|
-
const
|
|
1932
|
-
if (
|
|
2073
|
+
if (typeof rawRubric.min_score === "number") {
|
|
2074
|
+
const ms = rawRubric.min_score;
|
|
2075
|
+
if (ms <= 0 || ms > 1) {
|
|
2076
|
+
throw new Error(
|
|
2077
|
+
`Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
|
|
2078
|
+
);
|
|
2079
|
+
}
|
|
2080
|
+
minScore = ms;
|
|
2081
|
+
requiredMinScore = Math.round(ms * 10);
|
|
2082
|
+
} else if (typeof rawRubric.required_min_score === "number") {
|
|
2083
|
+
const rms = rawRubric.required_min_score;
|
|
2084
|
+
if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
|
|
1933
2085
|
throw new Error(
|
|
1934
|
-
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${
|
|
2086
|
+
`Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
|
|
1935
2087
|
);
|
|
1936
2088
|
}
|
|
1937
|
-
requiredMinScore =
|
|
2089
|
+
requiredMinScore = rms;
|
|
2090
|
+
minScore = rms / 10;
|
|
2091
|
+
logWarning2(
|
|
2092
|
+
`Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
|
|
2093
|
+
);
|
|
1938
2094
|
}
|
|
1939
2095
|
if (typeof rawRubric.required === "boolean") {
|
|
1940
2096
|
required = rawRubric.required;
|
|
@@ -1954,6 +2110,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1954
2110
|
weight,
|
|
1955
2111
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
1956
2112
|
...required !== void 0 ? { required } : {},
|
|
2113
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
1957
2114
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
|
|
1958
2115
|
score_ranges: scoreRanges
|
|
1959
2116
|
});
|
|
@@ -1970,6 +2127,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
|
|
|
1970
2127
|
weight,
|
|
1971
2128
|
// Default to required: true if not specified (backward compatibility)
|
|
1972
2129
|
required: required ?? true,
|
|
2130
|
+
...minScore !== void 0 ? { min_score: minScore } : {},
|
|
1973
2131
|
...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
|
|
1974
2132
|
});
|
|
1975
2133
|
}
|
|
@@ -2098,12 +2256,22 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
2098
2256
|
id: asString(rubric.id) ?? `rubric-${index + 1}`,
|
|
2099
2257
|
weight: typeof rubric.weight === "number" ? rubric.weight : 1
|
|
2100
2258
|
};
|
|
2259
|
+
let inlineMinScore;
|
|
2260
|
+
let inlineRequiredMinScore;
|
|
2261
|
+
if (typeof rubric.min_score === "number") {
|
|
2262
|
+
inlineMinScore = rubric.min_score;
|
|
2263
|
+
inlineRequiredMinScore = Math.round(inlineMinScore * 10);
|
|
2264
|
+
} else if (typeof rubric.required_min_score === "number") {
|
|
2265
|
+
inlineRequiredMinScore = rubric.required_min_score;
|
|
2266
|
+
inlineMinScore = inlineRequiredMinScore / 10;
|
|
2267
|
+
}
|
|
2101
2268
|
if (scoreRanges && scoreRanges.length > 0) {
|
|
2102
2269
|
return {
|
|
2103
2270
|
...baseRubric,
|
|
2104
2271
|
...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
|
|
2105
2272
|
...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
|
|
2106
|
-
...
|
|
2273
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
2274
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
|
|
2107
2275
|
score_ranges: scoreRanges
|
|
2108
2276
|
};
|
|
2109
2277
|
}
|
|
@@ -2111,7 +2279,8 @@ function parseInlineRubrics(rawRubrics) {
|
|
|
2111
2279
|
...baseRubric,
|
|
2112
2280
|
outcome: expectedOutcome,
|
|
2113
2281
|
required: typeof rubric.required === "boolean" ? rubric.required : true,
|
|
2114
|
-
...
|
|
2282
|
+
...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
|
|
2283
|
+
...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
|
|
2115
2284
|
};
|
|
2116
2285
|
}).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
|
|
2117
2286
|
if (rubricItems.length === 0) {
|
|
@@ -2511,6 +2680,9 @@ function resolveExpectedMessages(raw) {
|
|
|
2511
2680
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
2512
2681
|
var ANSI_RED2 = "\x1B[31m";
|
|
2513
2682
|
var ANSI_RESET6 = "\x1B[0m";
|
|
2683
|
+
function matchesFilter(id, filter) {
|
|
2684
|
+
return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
|
|
2685
|
+
}
|
|
2514
2686
|
function detectFormat(filePath) {
|
|
2515
2687
|
const ext = path6.extname(filePath).toLowerCase();
|
|
2516
2688
|
if (ext === ".jsonl") return "jsonl";
|
|
@@ -2578,40 +2750,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2578
2750
|
const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
|
|
2579
2751
|
const rawFile = await readFile5(absoluteTestPath, "utf8");
|
|
2580
2752
|
const rawCases = parseJsonlContent(rawFile, evalFilePath);
|
|
2581
|
-
const
|
|
2582
|
-
const
|
|
2753
|
+
const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
|
|
2754
|
+
const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
|
|
2583
2755
|
const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
|
|
2584
2756
|
const globalExecution = sidecar.execution;
|
|
2585
2757
|
if (verbose) {
|
|
2586
2758
|
console.log(`
|
|
2587
|
-
[JSONL
|
|
2759
|
+
[JSONL Suite: ${evalFilePath}]`);
|
|
2588
2760
|
console.log(` Cases: ${rawCases.length}`);
|
|
2589
|
-
console.log(`
|
|
2761
|
+
console.log(` Suite: ${suiteName}`);
|
|
2590
2762
|
if (sidecar.description) {
|
|
2591
2763
|
console.log(` Description: ${sidecar.description}`);
|
|
2592
2764
|
}
|
|
2593
2765
|
}
|
|
2594
2766
|
const results = [];
|
|
2595
2767
|
for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
|
|
2596
|
-
const
|
|
2768
|
+
const testCaseConfig = rawCases[lineIndex];
|
|
2597
2769
|
const lineNumber = lineIndex + 1;
|
|
2598
|
-
const id = asString4(
|
|
2599
|
-
if (filterPattern && (!id || !
|
|
2770
|
+
const id = asString4(testCaseConfig.id);
|
|
2771
|
+
if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
|
|
2600
2772
|
continue;
|
|
2601
2773
|
}
|
|
2602
|
-
const conversationId = asString4(
|
|
2603
|
-
let outcome = asString4(
|
|
2604
|
-
if (!outcome &&
|
|
2605
|
-
outcome = asString4(
|
|
2774
|
+
const conversationId = asString4(testCaseConfig.conversation_id);
|
|
2775
|
+
let outcome = asString4(testCaseConfig.criteria);
|
|
2776
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
2777
|
+
outcome = asString4(testCaseConfig.expected_outcome);
|
|
2606
2778
|
if (outcome) {
|
|
2607
2779
|
logWarning4(
|
|
2608
|
-
`Test '${asString4(
|
|
2780
|
+
`Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
2609
2781
|
);
|
|
2610
2782
|
}
|
|
2611
2783
|
}
|
|
2612
|
-
const rawInputMessages = resolveInputMessages(
|
|
2613
|
-
const expectedMessages = resolveExpectedMessages(
|
|
2614
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
2784
|
+
const rawInputMessages = resolveInputMessages(testCaseConfig);
|
|
2785
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
2786
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
|
|
2615
2787
|
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
2616
2788
|
logError2(
|
|
2617
2789
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
@@ -2648,18 +2820,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2648
2820
|
}
|
|
2649
2821
|
}
|
|
2650
2822
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
2651
|
-
const caseExecution = isJsonObject(
|
|
2823
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
2652
2824
|
const mergedExecution = caseExecution ?? globalExecution;
|
|
2653
|
-
const
|
|
2825
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
2654
2826
|
let evaluators;
|
|
2655
2827
|
try {
|
|
2656
|
-
evaluators = await parseEvaluators(
|
|
2828
|
+
evaluators = await parseEvaluators(
|
|
2829
|
+
testCaseConfig,
|
|
2830
|
+
mergedExecution,
|
|
2831
|
+
searchRoots,
|
|
2832
|
+
id ?? "unknown"
|
|
2833
|
+
);
|
|
2657
2834
|
} catch (error) {
|
|
2658
2835
|
const message = error instanceof Error ? error.message : String(error);
|
|
2659
2836
|
logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
|
|
2660
2837
|
continue;
|
|
2661
2838
|
}
|
|
2662
|
-
const inlineRubrics =
|
|
2839
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
2663
2840
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
2664
2841
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
2665
2842
|
if (rubricEvaluator) {
|
|
@@ -2670,7 +2847,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2670
2847
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
2671
2848
|
const testCase = {
|
|
2672
2849
|
id,
|
|
2673
|
-
|
|
2850
|
+
suite: suiteName,
|
|
2674
2851
|
conversation_id: conversationId,
|
|
2675
2852
|
question,
|
|
2676
2853
|
input: inputMessages,
|
|
@@ -2678,7 +2855,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2678
2855
|
reference_answer: referenceAnswer,
|
|
2679
2856
|
file_paths: userFilePaths,
|
|
2680
2857
|
criteria: outcome ?? "",
|
|
2681
|
-
evaluator:
|
|
2858
|
+
evaluator: testCaseEvaluatorKind,
|
|
2682
2859
|
assertions: evaluators
|
|
2683
2860
|
};
|
|
2684
2861
|
results.push(testCase);
|
|
@@ -2861,6 +3038,9 @@ function buildChatPromptFromSegments(options) {
|
|
|
2861
3038
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
2862
3039
|
var ANSI_RED3 = "\x1B[31m";
|
|
2863
3040
|
var ANSI_RESET7 = "\x1B[0m";
|
|
3041
|
+
/**
 * Reports whether a test id is selected by the given filter.
 *
 * @param {string} id - Test id to check.
 * @param {string | string[]} filter - A single pattern, or a list of patterns
 *   of which at least one must match.
 * @returns {boolean} Result of `micromatch2.isMatch` for a single pattern, or
 *   whether any pattern in the list matches.
 */
function matchesFilter2(id, filter) {
  if (typeof filter === "string") {
    return micromatch2.isMatch(id, filter);
  }
  const matchesId = (pattern) => micromatch2.isMatch(id, pattern);
  return filter.some(matchesId);
}
|
|
2864
3044
|
function resolveTests(suite) {
|
|
2865
3045
|
if (suite.tests !== void 0) return suite.tests;
|
|
2866
3046
|
if (suite.eval_cases !== void 0) {
|
|
@@ -2940,18 +3120,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2940
3120
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
2941
3121
|
}
|
|
2942
3122
|
const suite = interpolated;
|
|
2943
|
-
const
|
|
2944
|
-
const
|
|
2945
|
-
const
|
|
2946
|
-
const
|
|
3123
|
+
const suiteNameFromFile = asString5(suite.name)?.trim();
|
|
3124
|
+
const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
3125
|
+
const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
|
|
3126
|
+
const rawTestCases = resolveTests(suite);
|
|
2947
3127
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
2948
3128
|
const evalFileDir = path7.dirname(absoluteTestPath);
|
|
2949
|
-
let
|
|
2950
|
-
if (typeof
|
|
2951
|
-
const externalPath = path7.resolve(evalFileDir,
|
|
2952
|
-
|
|
2953
|
-
} else if (Array.isArray(
|
|
2954
|
-
|
|
3129
|
+
let expandedTestCases;
|
|
3130
|
+
if (typeof rawTestCases === "string") {
|
|
3131
|
+
const externalPath = path7.resolve(evalFileDir, rawTestCases);
|
|
3132
|
+
expandedTestCases = await loadCasesFromFile(externalPath);
|
|
3133
|
+
} else if (Array.isArray(rawTestCases)) {
|
|
3134
|
+
expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
|
|
2955
3135
|
} else {
|
|
2956
3136
|
throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
|
|
2957
3137
|
}
|
|
@@ -2966,32 +3146,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2966
3146
|
}
|
|
2967
3147
|
const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
|
|
2968
3148
|
const results = [];
|
|
2969
|
-
for (const
|
|
2970
|
-
if (!isJsonObject(
|
|
3149
|
+
for (const rawTestCase of expandedTestCases) {
|
|
3150
|
+
if (!isJsonObject(rawTestCase)) {
|
|
2971
3151
|
logWarning5("Skipping invalid test entry (expected object)");
|
|
2972
3152
|
continue;
|
|
2973
3153
|
}
|
|
2974
|
-
const
|
|
2975
|
-
const id = asString5(
|
|
2976
|
-
if (filterPattern && (!id || !
|
|
3154
|
+
const testCaseConfig = rawTestCase;
|
|
3155
|
+
const id = asString5(testCaseConfig.id);
|
|
3156
|
+
if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
|
|
2977
3157
|
continue;
|
|
2978
3158
|
}
|
|
2979
|
-
const conversationId = asString5(
|
|
2980
|
-
let outcome = asString5(
|
|
2981
|
-
if (!outcome &&
|
|
2982
|
-
outcome = asString5(
|
|
3159
|
+
const conversationId = asString5(testCaseConfig.conversation_id);
|
|
3160
|
+
let outcome = asString5(testCaseConfig.criteria);
|
|
3161
|
+
if (!outcome && testCaseConfig.expected_outcome !== void 0) {
|
|
3162
|
+
outcome = asString5(testCaseConfig.expected_outcome);
|
|
2983
3163
|
if (outcome) {
|
|
2984
3164
|
logWarning5(
|
|
2985
|
-
`Test '${asString5(
|
|
3165
|
+
`Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
2986
3166
|
);
|
|
2987
3167
|
}
|
|
2988
3168
|
}
|
|
2989
|
-
const caseExecution = isJsonObject(
|
|
3169
|
+
const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
|
|
2990
3170
|
const skipDefaults = caseExecution?.skip_defaults === true;
|
|
3171
|
+
const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
|
|
2991
3172
|
const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
|
|
2992
|
-
const testInputMessages = resolveInputMessages(
|
|
2993
|
-
const expectedMessages = resolveExpectedMessages(
|
|
2994
|
-
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 ||
|
|
3173
|
+
const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
|
|
3174
|
+
const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
|
|
3175
|
+
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
|
|
2995
3176
|
if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
|
|
2996
3177
|
logError3(
|
|
2997
3178
|
`Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
|
|
@@ -3038,16 +3219,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3038
3219
|
}
|
|
3039
3220
|
}
|
|
3040
3221
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
3041
|
-
const
|
|
3222
|
+
const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
|
|
3042
3223
|
let evaluators;
|
|
3043
3224
|
try {
|
|
3044
|
-
evaluators = await parseEvaluators(
|
|
3225
|
+
evaluators = await parseEvaluators(
|
|
3226
|
+
testCaseConfig,
|
|
3227
|
+
globalExecution,
|
|
3228
|
+
searchRoots,
|
|
3229
|
+
id ?? "unknown"
|
|
3230
|
+
);
|
|
3045
3231
|
} catch (error) {
|
|
3046
3232
|
const message = error instanceof Error ? error.message : String(error);
|
|
3047
3233
|
logError3(`Skipping test '${id}': ${message}`);
|
|
3048
3234
|
continue;
|
|
3049
3235
|
}
|
|
3050
|
-
const inlineRubrics =
|
|
3236
|
+
const inlineRubrics = testCaseConfig.rubrics;
|
|
3051
3237
|
if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
|
|
3052
3238
|
const rubricEvaluator = parseInlineRubrics(inlineRubrics);
|
|
3053
3239
|
if (rubricEvaluator) {
|
|
@@ -3056,13 +3242,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3056
3242
|
}
|
|
3057
3243
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
3058
3244
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
3059
|
-
const caseWorkspace = await resolveWorkspaceConfig(
|
|
3245
|
+
const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
|
|
3060
3246
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
3061
|
-
const metadata = isJsonObject(
|
|
3062
|
-
const caseTargets = extractTargetsFromTestCase(
|
|
3247
|
+
const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
|
|
3248
|
+
const caseTargets = extractTargetsFromTestCase(testCaseConfig);
|
|
3063
3249
|
const testCase = {
|
|
3064
3250
|
id,
|
|
3065
|
-
|
|
3251
|
+
suite: suiteName,
|
|
3066
3252
|
category: options?.category,
|
|
3067
3253
|
conversation_id: conversationId,
|
|
3068
3254
|
question,
|
|
@@ -3071,11 +3257,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
3071
3257
|
reference_answer: referenceAnswer,
|
|
3072
3258
|
file_paths: userFilePaths,
|
|
3073
3259
|
criteria: outcome ?? "",
|
|
3074
|
-
evaluator:
|
|
3260
|
+
evaluator: testCaseEvaluatorKind,
|
|
3075
3261
|
assertions: evaluators,
|
|
3076
3262
|
workspace: mergedWorkspace,
|
|
3077
3263
|
metadata,
|
|
3078
|
-
targets: caseTargets
|
|
3264
|
+
targets: caseTargets,
|
|
3265
|
+
...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
|
|
3079
3266
|
};
|
|
3080
3267
|
results.push(testCase);
|
|
3081
3268
|
}
|
|
@@ -3619,7 +3806,7 @@ var AzureProvider = class {
|
|
|
3619
3806
|
};
|
|
3620
3807
|
this.retryConfig = config.retry;
|
|
3621
3808
|
const azure = createAzure(buildAzureOptions(config));
|
|
3622
|
-
this.model = azure.chat(config.deploymentName);
|
|
3809
|
+
this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
|
|
3623
3810
|
}
|
|
3624
3811
|
id;
|
|
3625
3812
|
kind = "azure";
|
|
@@ -3745,7 +3932,9 @@ function buildAzureOptions(config) {
|
|
|
3745
3932
|
const options = {
|
|
3746
3933
|
apiKey: config.apiKey,
|
|
3747
3934
|
apiVersion: config.version,
|
|
3748
|
-
|
|
3935
|
+
// Chat completions still use deployment-scoped Azure URLs for compatibility
|
|
3936
|
+
// with existing deployments. Responses API should use the SDK's v1 path.
|
|
3937
|
+
useDeploymentBasedUrls: config.apiFormat !== "responses"
|
|
3749
3938
|
};
|
|
3750
3939
|
const baseURL = normalizeAzureBaseUrl(config.resourceName);
|
|
3751
3940
|
if (baseURL) {
|
|
@@ -5216,15 +5405,16 @@ var CliProvider = class {
|
|
|
5216
5405
|
outputFilePath
|
|
5217
5406
|
);
|
|
5218
5407
|
const renderedCommand = renderTemplate(this.config.command, templateValues);
|
|
5408
|
+
const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;
|
|
5219
5409
|
if (this.verbose) {
|
|
5220
5410
|
console.log(
|
|
5221
|
-
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${
|
|
5411
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
|
|
5222
5412
|
);
|
|
5223
5413
|
}
|
|
5224
5414
|
try {
|
|
5225
5415
|
const startTime = Date.now();
|
|
5226
5416
|
const result = await this.runCommand(renderedCommand, {
|
|
5227
|
-
cwd:
|
|
5417
|
+
cwd: effectiveCwd,
|
|
5228
5418
|
env: process.env,
|
|
5229
5419
|
timeoutMs: this.config.timeoutMs,
|
|
5230
5420
|
signal: controller.signal
|
|
@@ -5257,7 +5447,7 @@ var CliProvider = class {
|
|
|
5257
5447
|
command: renderedCommand,
|
|
5258
5448
|
stderr: result.stderr,
|
|
5259
5449
|
exitCode: result.exitCode ?? 0,
|
|
5260
|
-
cwd:
|
|
5450
|
+
cwd: effectiveCwd,
|
|
5261
5451
|
outputFile: outputFilePath
|
|
5262
5452
|
}
|
|
5263
5453
|
};
|
|
@@ -5275,7 +5465,7 @@ var CliProvider = class {
|
|
|
5275
5465
|
command: renderedCommand,
|
|
5276
5466
|
stderr: result.stderr,
|
|
5277
5467
|
exitCode: result.exitCode ?? 0,
|
|
5278
|
-
cwd:
|
|
5468
|
+
cwd: effectiveCwd,
|
|
5279
5469
|
outputFile: outputFilePath,
|
|
5280
5470
|
error: errorMessage
|
|
5281
5471
|
}
|
|
@@ -5290,7 +5480,7 @@ var CliProvider = class {
|
|
|
5290
5480
|
command: renderedCommand,
|
|
5291
5481
|
stderr: result.stderr,
|
|
5292
5482
|
exitCode: result.exitCode ?? 0,
|
|
5293
|
-
cwd:
|
|
5483
|
+
cwd: effectiveCwd,
|
|
5294
5484
|
outputFile: outputFilePath,
|
|
5295
5485
|
recordId: evalCaseId
|
|
5296
5486
|
}
|
|
@@ -7240,9 +7430,9 @@ var MockProvider = class {
|
|
|
7240
7430
|
};
|
|
7241
7431
|
|
|
7242
7432
|
// src/evaluation/providers/pi-cli.ts
|
|
7243
|
-
import { spawn as spawn3 } from "node:child_process";
|
|
7433
|
+
import { execSync, spawn as spawn3 } from "node:child_process";
|
|
7244
7434
|
import { randomUUID as randomUUID7 } from "node:crypto";
|
|
7245
|
-
import { createWriteStream as createWriteStream5 } from "node:fs";
|
|
7435
|
+
import { accessSync, createWriteStream as createWriteStream5, readFileSync as readFileSync2 } from "node:fs";
|
|
7246
7436
|
import { mkdir as mkdir6, mkdtemp, rm, writeFile } from "node:fs/promises";
|
|
7247
7437
|
import { tmpdir } from "node:os";
|
|
7248
7438
|
import path19 from "node:path";
|
|
@@ -7300,6 +7490,75 @@ function subscribeToPiLogEntries(listener) {
|
|
|
7300
7490
|
};
|
|
7301
7491
|
}
|
|
7302
7492
|
|
|
7493
|
+
// src/evaluation/providers/pi-provider-aliases.ts
|
|
7494
|
+
var SUBPROVIDER_ALIASES = {
|
|
7495
|
+
azure: "azure-openai-responses"
|
|
7496
|
+
};
|
|
7497
|
+
var SUBPROVIDER_ALIASES_WITH_BASE_URL = {
|
|
7498
|
+
// Azure v1 endpoints are OpenAI-compatible; use the standard client
|
|
7499
|
+
// to avoid AzureOpenAI adding api-version query params.
|
|
7500
|
+
azure: "openai-responses"
|
|
7501
|
+
};
|
|
7502
|
+
var ENV_KEY_MAP = {
|
|
7503
|
+
google: "GEMINI_API_KEY",
|
|
7504
|
+
gemini: "GEMINI_API_KEY",
|
|
7505
|
+
anthropic: "ANTHROPIC_API_KEY",
|
|
7506
|
+
openai: "OPENAI_API_KEY",
|
|
7507
|
+
groq: "GROQ_API_KEY",
|
|
7508
|
+
xai: "XAI_API_KEY",
|
|
7509
|
+
openrouter: "OPENROUTER_API_KEY",
|
|
7510
|
+
azure: "AZURE_OPENAI_API_KEY"
|
|
7511
|
+
};
|
|
7512
|
+
var ENV_BASE_URL_MAP = {
|
|
7513
|
+
openai: "OPENAI_BASE_URL",
|
|
7514
|
+
azure: "AZURE_OPENAI_BASE_URL",
|
|
7515
|
+
openrouter: "OPENROUTER_BASE_URL"
|
|
7516
|
+
};
|
|
7517
|
+
/**
 * Maps a configured sub-provider name to the provider identifier to use.
 *
 * The lookup is case-insensitive. When `hasBaseUrl` is true, the
 * base-URL-specific alias table is consulted first; otherwise (or when it has
 * no entry) the general alias table is used. Names without an alias are
 * returned unchanged, in their original casing.
 *
 * @param {string} name - Sub-provider name from configuration.
 * @param {boolean} [hasBaseUrl=false] - Whether a custom base URL is configured.
 * @returns {string} The aliased provider name, or `name` itself.
 */
function resolveSubprovider(name, hasBaseUrl = false) {
  const lower = name.toLowerCase();
  // Only own entries count as aliases. A plain `table[lower]` would also find
  // inherited Object.prototype members (e.g. "constructor", "__proto__") and
  // return a non-string value.
  const ownAlias = (table) => (Object.hasOwn(table, lower) ? table[lower] : void 0);
  if (hasBaseUrl) {
    const alias = ownAlias(SUBPROVIDER_ALIASES_WITH_BASE_URL);
    if (alias) return alias;
  }
  return ownAlias(SUBPROVIDER_ALIASES) ?? name;
}
|
|
7525
|
+
/**
 * Translates a sub-provider name into the value passed to the pi CLI's
 * `--provider` flag.
 *
 * @param {string} name - Sub-provider name from configuration.
 * @returns {string} `"azure-openai-responses"` when `name` is "azure" in any
 *   casing; otherwise `name` unchanged.
 */
function resolveCliProvider(name) {
  return name.toLowerCase() === "azure" ? "azure-openai-responses" : name;
}
|
|
7530
|
+
/**
 * Returns the name of the environment variable that carries the API key for
 * a provider.
 *
 * @param {string} provider - Provider name (matched case-insensitively).
 * @param {boolean} [hasBaseUrl=false] - Whether a custom base URL is
 *   configured; Azure with a base URL uses `OPENAI_API_KEY`.
 * @returns {string | undefined} The env var name, or `undefined` when the
 *   provider has no entry in `ENV_KEY_MAP`.
 */
function resolveEnvKeyName(provider, hasBaseUrl = false) {
  const lower = provider.toLowerCase();
  if (hasBaseUrl && lower === "azure") return "OPENAI_API_KEY";
  // Own-property check: an inherited member such as "constructor" must not be
  // returned as if it were an env var name.
  return Object.hasOwn(ENV_KEY_MAP, lower) ? ENV_KEY_MAP[lower] : void 0;
}
|
|
7535
|
+
/**
 * Returns the name of the environment variable that carries the base URL for
 * a provider.
 *
 * @param {string} provider - Provider name (matched case-insensitively).
 * @param {boolean} [hasBaseUrl=false] - Whether a custom base URL is
 *   configured; Azure with a base URL uses `OPENAI_BASE_URL`.
 * @returns {string | undefined} The env var name, or `undefined` when the
 *   provider has no entry in `ENV_BASE_URL_MAP`.
 */
function resolveEnvBaseUrlName(provider, hasBaseUrl = false) {
  const lower = provider.toLowerCase();
  if (hasBaseUrl && lower === "azure") return "OPENAI_BASE_URL";
  // Own-property check: an inherited member such as "constructor" must not be
  // returned as if it were an env var name.
  return Object.hasOwn(ENV_BASE_URL_MAP, lower) ? ENV_BASE_URL_MAP[lower] : void 0;
}
|
|
7540
|
+
/**
 * Extracts the Azure resource name (the first host label) from a base URL.
 *
 * Accepts a full URL (`https://myres.openai.azure.com/...`), a scheme-less
 * host (`myres.openai.azure.com`), or a bare resource name (`myres`). The
 * scheme is matched case-insensitively and surrounding whitespace is ignored.
 *
 * @param {string} baseUrl - URL, host, or resource name.
 * @returns {string} The first host label, or the trimmed input when no label
 *   can be extracted (e.g. an empty string).
 */
function extractAzureResourceName(baseUrl) {
  const trimmed = baseUrl.trim();
  const withoutScheme = trimmed.replace(/^https?:\/\//i, "");
  // The resource name ends at the first dot, path separator, or port colon.
  const firstLabel = withoutScheme.split(/[./:]/, 1)[0];
  return firstLabel || trimmed;
}
|
|
7545
|
+
/**
 * Normalizes an Azure base URL setting to a URL ending in `/openai/v1`.
 *
 * Accepted inputs:
 * - a bare resource name (`myres`), expanded to
 *   `https://myres.openai.azure.com/openai/v1`;
 * - a scheme-less host, optionally with a path (`myres.openai.azure.com`),
 *   which gets `https://` prepended instead of being treated as a resource
 *   name;
 * - a full `http(s)://` URL.
 *
 * Trailing slashes are removed, and `/openai/v1` (or just `/v1` when the URL
 * already ends in `/openai`) is appended when missing.
 *
 * @param {string} baseUrl - Resource name, host, or URL.
 * @returns {string} The normalized URL, or an empty string for blank input.
 */
function normalizeAzureSdkBaseUrl(baseUrl) {
  const trimmed = baseUrl.trim().replace(/\/+$/, "");
  if (!trimmed) {
    return trimmed;
  }
  let url = trimmed;
  if (!/^https?:\/\//i.test(trimmed)) {
    // A dot, slash, or colon means the value is already a host (and maybe a
    // path or port), not a bare resource name; appending the Azure domain to
    // it would produce a bogus host.
    const looksLikeHost = /[./:]/.test(trimmed);
    url = looksLikeHost ? `https://${trimmed}` : `https://${trimmed}.openai.azure.com`;
  }
  if (/\/openai\/v1$/i.test(url)) {
    return url;
  }
  if (/\/openai$/i.test(url)) {
    return `${url}/v1`;
  }
  return `${url}/openai/v1`;
}
|
|
7561
|
+
|
|
7303
7562
|
// src/evaluation/providers/pi-utils.ts
|
|
7304
7563
|
function extractPiTextContent(content) {
|
|
7305
7564
|
if (typeof content === "string") {
|
|
@@ -7458,12 +7717,12 @@ var PiCliProvider = class {
|
|
|
7458
7717
|
buildPiArgs(prompt, inputFiles) {
|
|
7459
7718
|
const args = [];
|
|
7460
7719
|
if (this.config.subprovider) {
|
|
7461
|
-
args.push("--provider", this.config.subprovider);
|
|
7720
|
+
args.push("--provider", resolveCliProvider(this.config.subprovider));
|
|
7462
7721
|
}
|
|
7463
7722
|
if (this.config.model) {
|
|
7464
7723
|
args.push("--model", this.config.model);
|
|
7465
7724
|
}
|
|
7466
|
-
if (this.config.apiKey) {
|
|
7725
|
+
if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== "azure") {
|
|
7467
7726
|
args.push("--api-key", this.config.apiKey);
|
|
7468
7727
|
}
|
|
7469
7728
|
args.push("--mode", "json");
|
|
@@ -7515,35 +7774,35 @@ ${prompt}` : prompt;
|
|
|
7515
7774
|
}
|
|
7516
7775
|
buildEnv() {
|
|
7517
7776
|
const env = { ...process.env };
|
|
7518
|
-
|
|
7519
|
-
|
|
7520
|
-
|
|
7521
|
-
|
|
7522
|
-
|
|
7523
|
-
|
|
7524
|
-
|
|
7525
|
-
|
|
7526
|
-
|
|
7527
|
-
|
|
7528
|
-
|
|
7529
|
-
|
|
7530
|
-
|
|
7531
|
-
|
|
7777
|
+
const provider = this.config.subprovider?.toLowerCase() ?? "google";
|
|
7778
|
+
if (provider === "azure") {
|
|
7779
|
+
if (this.config.apiKey) {
|
|
7780
|
+
env.AZURE_OPENAI_API_KEY = this.config.apiKey;
|
|
7781
|
+
}
|
|
7782
|
+
if (this.config.baseUrl) {
|
|
7783
|
+
env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
|
|
7784
|
+
}
|
|
7785
|
+
} else {
|
|
7786
|
+
if (this.config.apiKey) {
|
|
7787
|
+
const envKey = resolveEnvKeyName(provider);
|
|
7788
|
+
if (envKey) {
|
|
7789
|
+
env[envKey] = this.config.apiKey;
|
|
7790
|
+
}
|
|
7532
7791
|
}
|
|
7533
7792
|
}
|
|
7534
7793
|
if (this.config.subprovider) {
|
|
7535
|
-
const
|
|
7794
|
+
const resolvedProvider = resolveCliProvider(this.config.subprovider);
|
|
7536
7795
|
const PROVIDER_OWN_PREFIXES = {
|
|
7537
7796
|
openrouter: ["OPENROUTER_"],
|
|
7538
7797
|
anthropic: ["ANTHROPIC_"],
|
|
7539
7798
|
openai: ["OPENAI_"],
|
|
7540
|
-
azure: ["AZURE_OPENAI_"],
|
|
7799
|
+
"azure-openai-responses": ["AZURE_OPENAI_"],
|
|
7541
7800
|
google: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
|
|
7542
7801
|
gemini: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
|
|
7543
7802
|
groq: ["GROQ_"],
|
|
7544
7803
|
xai: ["XAI_"]
|
|
7545
7804
|
};
|
|
7546
|
-
const ownPrefixes = PROVIDER_OWN_PREFIXES[
|
|
7805
|
+
const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
|
|
7547
7806
|
const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES).filter(([key]) => key !== provider).flatMap(([, prefixes]) => prefixes);
|
|
7548
7807
|
for (const key of Object.keys(env)) {
|
|
7549
7808
|
if (allOtherPrefixes.some((prefix) => key.startsWith(prefix)) && !ownPrefixes.some((prefix) => key.startsWith(prefix))) {
|
|
@@ -7834,6 +8093,24 @@ function extractMessages(events) {
|
|
|
7834
8093
|
}
|
|
7835
8094
|
}
|
|
7836
8095
|
}
|
|
8096
|
+
if (messages) {
|
|
8097
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
8098
|
+
if (messages[i].role === "assistant" && !messages[i].content) {
|
|
8099
|
+
for (let j = events.length - 1; j >= 0; j--) {
|
|
8100
|
+
const evt = events[j];
|
|
8101
|
+
if (!evt || evt.type !== "message_end") continue;
|
|
8102
|
+
const msg = evt.message;
|
|
8103
|
+
if (msg?.role !== "assistant") continue;
|
|
8104
|
+
const text = extractPiTextContent(msg.content);
|
|
8105
|
+
if (text) {
|
|
8106
|
+
messages[i] = { ...messages[i], content: text };
|
|
8107
|
+
break;
|
|
8108
|
+
}
|
|
8109
|
+
}
|
|
8110
|
+
break;
|
|
8111
|
+
}
|
|
8112
|
+
}
|
|
8113
|
+
}
|
|
7837
8114
|
const eventToolCalls = extractToolCallsFromEvents(events);
|
|
7838
8115
|
if (eventToolCalls.length > 0) {
|
|
7839
8116
|
injectEventToolCalls(messages, eventToolCalls);
|
|
@@ -8018,17 +8295,43 @@ function formatTimeoutSuffix3(timeoutMs) {
|
|
|
8018
8295
|
if (!timeoutMs || timeoutMs <= 0) return "";
|
|
8019
8296
|
return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
|
|
8020
8297
|
}
|
|
8298
|
+
/**
 * On Windows, tries to turn a command that resolves to a `.cmd` shim into a
 * direct `node <script.js>` invocation so it can be spawned without a shell.
 *
 * The shim is located with `where`, read as text, and searched for a
 * `"%_prog%" "<path>.js"` invocation; `%dp0%` in that path is replaced with
 * the shim's directory. Every failure (command not found, shim unreadable,
 * pattern absent, script missing) falls back to the executable unchanged.
 *
 * @param {string} executable - Command name or path (no arguments).
 * @returns {[string, string[]]} The program to spawn and the arguments to
 *   prepend: `["node", [scriptPath]]` when a shim was resolved, otherwise
 *   `[executable, []]`.
 */
function resolveWindowsCmd(executable) {
  const unchanged = [executable, []];
  if (process.platform !== "win32") return unchanged;
  const lower = executable.toLowerCase();
  if (lower.endsWith(".js") || lower.endsWith(".exe")) return unchanged;
  // `where` runs through a shell, so never interpolate a value containing
  // whitespace or cmd.exe metacharacters into the command line.
  if (/[\s"&|<>^%]/.test(executable)) return unchanged;
  let fullPath;
  try {
    fullPath = execSync(`where ${executable}`, {
      encoding: "utf-8",
      // Keep "not found" diagnostics from `where` off the user's console.
      stdio: ["ignore", "pipe", "ignore"]
    }).trim().split(/\r?\n/)[0].trim();
  } catch {
    // Not on PATH (or `where` unavailable): let spawn deal with it as-is.
    return unchanged;
  }
  // Windows file names are case-insensitive; `where` may report "FOO.CMD".
  const cmdPath = fullPath.toLowerCase().endsWith(".cmd") ? fullPath : `${fullPath}.cmd`;
  try {
    const content = readFileSync2(cmdPath, "utf-8");
    const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
    if (match) {
      const dp0 = path19.dirname(path19.resolve(cmdPath));
      const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${path19.sep}`);
      try {
        accessSync(scriptPath);
        return ["node", [scriptPath]];
      } catch {
        // Script named by the shim is missing: fall through to the default.
      }
    }
  } catch {
    // Shim unreadable: fall through to the default.
  }
  return unchanged;
}
|
|
8021
8325
|
async function defaultPiRunner(options) {
|
|
8022
8326
|
return await new Promise((resolve, reject) => {
|
|
8023
8327
|
const parts = options.executable.split(/\s+/);
|
|
8024
|
-
const
|
|
8025
|
-
const executableArgs = parts.slice(1);
|
|
8328
|
+
const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
|
|
8329
|
+
const executableArgs = [...prefixArgs, ...parts.slice(1)];
|
|
8026
8330
|
const allArgs = [...executableArgs, ...options.args];
|
|
8027
|
-
const child = spawn3(
|
|
8331
|
+
const child = spawn3(resolvedExe, allArgs, {
|
|
8028
8332
|
cwd: options.cwd,
|
|
8029
8333
|
env: options.env,
|
|
8030
|
-
stdio: ["pipe", "pipe", "pipe"]
|
|
8031
|
-
shell: false
|
|
8334
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
8032
8335
|
});
|
|
8033
8336
|
let stdout = "";
|
|
8034
8337
|
let stderr = "";
|
|
@@ -8083,13 +8386,43 @@ async function defaultPiRunner(options) {
|
|
|
8083
8386
|
}
|
|
8084
8387
|
|
|
8085
8388
|
// src/evaluation/providers/pi-coding-agent.ts
|
|
8086
|
-
import { execSync } from "node:child_process";
|
|
8389
|
+
import { execSync as execSync2 } from "node:child_process";
|
|
8087
8390
|
import { randomUUID as randomUUID8 } from "node:crypto";
|
|
8088
|
-
import { accessSync, createWriteStream as createWriteStream6 } from "node:fs";
|
|
8391
|
+
import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
|
|
8089
8392
|
import { mkdir as mkdir7 } from "node:fs/promises";
|
|
8090
|
-
import
|
|
8393
|
+
import path21 from "node:path";
|
|
8091
8394
|
import { createInterface } from "node:readline";
|
|
8092
|
-
import { fileURLToPath as fileURLToPath3 } from "node:url";
|
|
8395
|
+
import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
|
|
8396
|
+
|
|
8397
|
+
// src/paths.ts
|
|
8398
|
+
import os2 from "node:os";
|
|
8399
|
+
import path20 from "node:path";
|
|
8400
|
+
// Set once the AGENTV_HOME notice has been printed, so it appears only once.
var logged = false;
/**
 * Returns the AgentV home directory.
 *
 * Uses the `AGENTV_HOME` environment variable when it is set to a non-empty
 * value other than the literal string "undefined" (printing a one-time
 * warning that the override is in use); otherwise `<user home>/.agentv`.
 *
 * @returns {string} Path of the AgentV home directory.
 */
function getAgentvHome() {
  const override = process.env.AGENTV_HOME;
  const hasOverride = Boolean(override) && override !== "undefined";
  if (!hasOverride) {
    return path20.join(os2.homedir(), ".agentv");
  }
  if (logged === false) {
    logged = true;
    console.warn(`Using AGENTV_HOME: ${override}`);
  }
  return override;
}
|
|
8412
|
+
/**
 * Returns the `workspaces` directory inside the AgentV home directory.
 *
 * @returns {string} `<agentv home>/workspaces`.
 */
function getWorkspacesRoot() {
  const agentvHome = getAgentvHome();
  return path20.join(agentvHome, "workspaces");
}
|
|
8415
|
+
/**
 * Returns the `subagents` directory inside the AgentV home directory.
 *
 * @returns {string} `<agentv home>/subagents`.
 */
function getSubagentsRoot() {
  const agentvHome = getAgentvHome();
  return path20.join(agentvHome, "subagents");
}
|
|
8418
|
+
/**
 * Returns the `trace-state` directory inside the AgentV home directory.
 *
 * @returns {string} `<agentv home>/trace-state`.
 */
function getTraceStateRoot() {
  const agentvHome = getAgentvHome();
  return path20.join(agentvHome, "trace-state");
}
|
|
8421
|
+
/**
 * Returns the `workspace-pool` directory inside the AgentV home directory.
 *
 * @returns {string} `<agentv home>/workspace-pool`.
 */
function getWorkspacePoolRoot() {
  const agentvHome = getAgentvHome();
  return path20.join(agentvHome, "workspace-pool");
}
|
|
8424
|
+
|
|
8425
|
+
// src/evaluation/providers/pi-coding-agent.ts
|
|
8093
8426
|
var piCodingAgentModule = null;
|
|
8094
8427
|
var piAiModule = null;
|
|
8095
8428
|
var loadingPromise = null;
|
|
@@ -8107,46 +8440,126 @@ async function promptInstall() {
|
|
|
8107
8440
|
rl.close();
|
|
8108
8441
|
}
|
|
8109
8442
|
}
|
|
8110
|
-
function
|
|
8111
|
-
|
|
8112
|
-
|
|
8113
|
-
|
|
8443
|
+
/**
 * Returns the directory used for the AgentV-managed install of the pi SDK.
 *
 * @returns {string} `<agentv home>/deps/pi-sdk`.
 */
function findManagedSdkInstallRoot() {
  const relativeSegments = ["deps", "pi-sdk"];
  return path21.join(getAgentvHome(), ...relativeSegments);
}
|
|
8446
|
+
/**
 * Asks npm for its global `node_modules` directory by running `npm root -g`.
 *
 * @returns {string | undefined} The trimmed command output, or `undefined`
 *   when the command fails or prints nothing.
 */
function resolveGlobalNpmRoot() {
  let output;
  try {
    output = execSync2("npm root -g", {
      encoding: "utf-8",
      stdio: ["ignore", "pipe", "ignore"]
    });
  } catch {
    // npm is missing or failed; report "unknown" rather than throwing.
    return void 0;
  }
  const root = output.trim();
  if (root.length === 0) {
    return void 0;
  }
  return root;
}
|
|
8457
|
+
/**
 * Builds the path of a package's `dist/index.js` under a global npm root.
 *
 * @param {string} moduleName - Package name; a scoped name such as
 *   `@scope/pkg` is split on "/" into nested directories.
 * @param {string} globalNpmRoot - Global `node_modules` directory.
 * @returns {string} `<globalNpmRoot>/<moduleName>/dist/index.js`.
 */
function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
  const segments = [globalNpmRoot, ...moduleName.split("/"), "dist", "index.js"];
  return path21.join(...segments);
}
|
|
8460
|
+
function findAccessiblePath(paths) {
|
|
8461
|
+
for (const candidate of paths) {
|
|
8114
8462
|
try {
|
|
8115
|
-
|
|
8116
|
-
|
|
8117
|
-
return dir;
|
|
8463
|
+
accessSync2(candidate);
|
|
8464
|
+
return candidate;
|
|
8118
8465
|
} catch {
|
|
8119
|
-
const parent = path20.dirname(dir);
|
|
8120
|
-
if (parent === dir) break;
|
|
8121
|
-
dir = parent;
|
|
8122
8466
|
}
|
|
8123
8467
|
}
|
|
8124
|
-
return
|
|
8468
|
+
return void 0;
|
|
8125
8469
|
}
|
|
8126
|
-
async function
|
|
8470
|
+
async function tryImportLocalSdkModules() {
|
|
8127
8471
|
try {
|
|
8128
8472
|
[piCodingAgentModule, piAiModule] = await Promise.all([
|
|
8129
8473
|
import("@mariozechner/pi-coding-agent"),
|
|
8130
8474
|
import("@mariozechner/pi-ai")
|
|
8131
8475
|
]);
|
|
8476
|
+
return true;
|
|
8132
8477
|
} catch {
|
|
8133
|
-
|
|
8134
|
-
|
|
8135
|
-
|
|
8136
|
-
|
|
8137
|
-
|
|
8138
|
-
|
|
8139
|
-
|
|
8140
|
-
|
|
8141
|
-
|
|
8142
|
-
|
|
8143
|
-
|
|
8144
|
-
|
|
8145
|
-
|
|
8146
|
-
|
|
8147
|
-
|
|
8478
|
+
return false;
|
|
8479
|
+
}
|
|
8480
|
+
}
|
|
8481
|
+
/**
 * Tries to load the pi SDK from the AgentV-managed install directory.
 *
 * Looks for `@mariozechner/pi-coding-agent` directly under the managed
 * `node_modules`, and for `@mariozechner/pi-ai` either beside it or nested in
 * pi-coding-agent's own `node_modules`. On success the two module-level SDK
 * references are assigned.
 *
 * @returns {Promise<boolean>} `true` when both modules were imported; `false`
 *   when an entry file is missing or an import fails.
 */
async function tryImportManagedSdkModules() {
  const nodeModules = [findManagedSdkInstallRoot(), "node_modules"];
  const agentPackage = [...nodeModules, "@mariozechner", "pi-coding-agent"];
  const entryFile = ["dist", "index.js"];
  const piCodingAgentEntry = findAccessiblePath([path21.join(...agentPackage, ...entryFile)]);
  const piAiEntry = findAccessiblePath([
    path21.join(...nodeModules, "@mariozechner", "pi-ai", ...entryFile),
    path21.join(...agentPackage, "node_modules", "@mariozechner", "pi-ai", ...entryFile)
  ]);
  if (!(piCodingAgentEntry && piAiEntry)) {
    return false;
  }
  try {
    const loaded = await Promise.all([
      import(pathToFileURL(piCodingAgentEntry).href),
      import(pathToFileURL(piAiEntry).href)
    ]);
    [piCodingAgentModule, piAiModule] = loaded;
    return true;
  } catch {
    return false;
  }
}
|
|
8511
|
+
/**
 * Tries to load the pi SDK from npm's global `node_modules` directory.
 *
 * Looks for `@mariozechner/pi-coding-agent` at the global root, and for
 * `@mariozechner/pi-ai` either at the global root or nested in
 * pi-coding-agent's own `node_modules`. On success the two module-level SDK
 * references are assigned.
 *
 * @returns {Promise<boolean>} `true` when both modules were imported; `false`
 *   when the global root is unknown, an entry file is missing, or an import
 *   fails.
 */
async function tryImportGlobalSdkModules() {
  const globalNpmRoot = resolveGlobalNpmRoot();
  if (!globalNpmRoot) {
    return false;
  }
  const agentCandidates = [buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)];
  const nestedPiAi = path21.join(
    globalNpmRoot,
    "@mariozechner",
    "pi-coding-agent",
    "node_modules",
    "@mariozechner",
    "pi-ai",
    "dist",
    "index.js"
  );
  const piCodingAgentEntry = findAccessiblePath(agentCandidates);
  const piAiEntry = findAccessiblePath([
    buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
    nestedPiAi
  ]);
  if (!(piCodingAgentEntry && piAiEntry)) {
    return false;
  }
  try {
    const loaded = await Promise.all([
      import(pathToFileURL(piCodingAgentEntry).href),
      import(pathToFileURL(piAiEntry).href)
    ]);
    [piCodingAgentModule, piAiModule] = loaded;
    return true;
  } catch {
    return false;
  }
}
|
|
8541
|
+
/**
 * Installs `@mariozechner/pi-coding-agent` into the given directory with npm.
 *
 * Creates the directory if needed and runs `npm install` there synchronously,
 * streaming npm's output to the current process. Throws if npm exits with a
 * failure.
 *
 * @param {string} installDir - Directory that should receive `node_modules`.
 */
function installSdkModules(installDir) {
  console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
  mkdirSync(installDir, { recursive: true });
  // A fresh installDir has neither package.json nor node_modules, so npm would
  // otherwise walk up and may pick an ancestor directory (e.g. the user's
  // home) as the project root and install there. `--prefix .` pins the install
  // to the working directory, which is installDir.
  execSync2("npm install --prefix . --no-save --no-package-lock @mariozechner/pi-coding-agent", {
    cwd: installDir,
    stdio: "inherit"
  });
}
|
|
8549
|
+
/**
 * Loads the pi SDK modules, trying each source in order: a normal import,
 * the AgentV-managed install, then the global npm install.
 *
 * If none succeeds and `promptInstall()` resolves truthy, the SDK is
 * installed into the managed directory and the managed import is retried.
 *
 * @returns {Promise<void>} Resolves once the SDK modules are loaded.
 * @throws {Error} When the SDK could not be loaded from any source.
 */
async function doLoadSdkModules() {
  const importers = [
    tryImportLocalSdkModules,
    tryImportManagedSdkModules,
    tryImportGlobalSdkModules
  ];
  // Sequential on purpose: stop at the first source that loads.
  for (const tryImport of importers) {
    if (await tryImport()) {
      return;
    }
  }
  const userAgreed = await promptInstall();
  if (userAgreed) {
    installSdkModules(findManagedSdkInstallRoot());
    const loaded = await tryImportManagedSdkModules();
    if (loaded) {
      return;
    }
  }
  throw new Error(
    "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
  );
}
|
|
8151
8564
|
async function loadSdkModules() {
|
|
8152
8565
|
if (!piCodingAgentModule || !piAiModule) {
|
|
@@ -8174,7 +8587,9 @@ async function loadSdkModules() {
|
|
|
8174
8587
|
codingTools: piSdk.codingTools,
|
|
8175
8588
|
toolMap,
|
|
8176
8589
|
SessionManager: piSdk.SessionManager,
|
|
8177
|
-
getModel: piAi.getModel
|
|
8590
|
+
getModel: piAi.getModel,
|
|
8591
|
+
// biome-ignore lint/suspicious/noExplicitAny: registerBuiltInApiProviders exists at runtime but not in type defs
|
|
8592
|
+
registerBuiltInApiProviders: piAi.registerBuiltInApiProviders
|
|
8178
8593
|
};
|
|
8179
8594
|
}
|
|
8180
8595
|
var PiCodingAgentProvider = class {
|
|
@@ -8196,17 +8611,35 @@ var PiCodingAgentProvider = class {
|
|
|
8196
8611
|
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
8197
8612
|
const startMs = Date.now();
|
|
8198
8613
|
const sdk = await loadSdkModules();
|
|
8614
|
+
sdk.registerBuiltInApiProviders();
|
|
8199
8615
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
8200
8616
|
try {
|
|
8201
8617
|
const cwd = this.resolveCwd(request.cwd);
|
|
8202
|
-
const
|
|
8618
|
+
const rawProvider = this.config.subprovider ?? "google";
|
|
8619
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
|
|
8620
|
+
const hasBaseUrl = !!normalizedBaseUrl;
|
|
8621
|
+
const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
|
|
8203
8622
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
8204
|
-
this.setApiKeyEnv(
|
|
8205
|
-
|
|
8623
|
+
this.setApiKeyEnv(rawProvider, hasBaseUrl);
|
|
8624
|
+
this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
|
|
8625
|
+
let model = sdk.getModel(providerName, modelId);
|
|
8626
|
+
if (model && normalizedBaseUrl) {
|
|
8627
|
+
model = { ...model, baseUrl: normalizedBaseUrl };
|
|
8628
|
+
}
|
|
8206
8629
|
if (!model) {
|
|
8207
|
-
|
|
8208
|
-
|
|
8209
|
-
|
|
8630
|
+
const envProvider = providerName.replace(/-responses$/, "");
|
|
8631
|
+
model = {
|
|
8632
|
+
id: modelId,
|
|
8633
|
+
name: modelId,
|
|
8634
|
+
api: providerName,
|
|
8635
|
+
provider: envProvider,
|
|
8636
|
+
baseUrl: normalizedBaseUrl ?? "",
|
|
8637
|
+
reasoning: false,
|
|
8638
|
+
input: ["text"],
|
|
8639
|
+
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
|
8640
|
+
contextWindow: 128e3,
|
|
8641
|
+
maxTokens: 16384
|
|
8642
|
+
};
|
|
8210
8643
|
}
|
|
8211
8644
|
const tools = this.resolveTools(sdk);
|
|
8212
8645
|
const { session } = await sdk.createAgentSession({
|
|
@@ -8359,28 +8792,35 @@ ${fileList}`;
|
|
|
8359
8792
|
}
|
|
8360
8793
|
}
|
|
8361
8794
|
/** Maps config apiKey to the provider-specific env var the SDK reads. */
|
|
8362
|
-
setApiKeyEnv(providerName) {
|
|
8795
|
+
setApiKeyEnv(providerName, hasBaseUrl = false) {
|
|
8363
8796
|
if (!this.config.apiKey) return;
|
|
8364
|
-
const
|
|
8365
|
-
google: "GEMINI_API_KEY",
|
|
8366
|
-
gemini: "GEMINI_API_KEY",
|
|
8367
|
-
anthropic: "ANTHROPIC_API_KEY",
|
|
8368
|
-
openai: "OPENAI_API_KEY",
|
|
8369
|
-
groq: "GROQ_API_KEY",
|
|
8370
|
-
xai: "XAI_API_KEY",
|
|
8371
|
-
openrouter: "OPENROUTER_API_KEY"
|
|
8372
|
-
};
|
|
8373
|
-
const envKey = ENV_KEY_MAP[providerName.toLowerCase()];
|
|
8797
|
+
const envKey = resolveEnvKeyName(providerName, hasBaseUrl);
|
|
8374
8798
|
if (envKey) {
|
|
8375
8799
|
process.env[envKey] = this.config.apiKey;
|
|
8376
8800
|
}
|
|
8377
8801
|
}
|
|
8802
|
+
/** Maps config baseUrl to the provider-specific env var the SDK reads. */
|
|
8803
|
+
setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
|
|
8804
|
+
const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
|
|
8805
|
+
if (!normalizedBaseUrl) return;
|
|
8806
|
+
const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
|
|
8807
|
+
if (envKey) {
|
|
8808
|
+
process.env[envKey] = normalizedBaseUrl;
|
|
8809
|
+
}
|
|
8810
|
+
}
|
|
8811
|
+
normalizeSdkBaseUrl(providerName, baseUrl) {
|
|
8812
|
+
if (!baseUrl) return void 0;
|
|
8813
|
+
if (providerName.toLowerCase() === "azure") {
|
|
8814
|
+
return normalizeAzureSdkBaseUrl(baseUrl);
|
|
8815
|
+
}
|
|
8816
|
+
return baseUrl;
|
|
8817
|
+
}
|
|
8378
8818
|
resolveCwd(cwdOverride) {
|
|
8379
8819
|
if (cwdOverride) {
|
|
8380
|
-
return
|
|
8820
|
+
return path21.resolve(cwdOverride);
|
|
8381
8821
|
}
|
|
8382
8822
|
if (this.config.cwd) {
|
|
8383
|
-
return
|
|
8823
|
+
return path21.resolve(this.config.cwd);
|
|
8384
8824
|
}
|
|
8385
8825
|
return process.cwd();
|
|
8386
8826
|
}
|
|
@@ -8399,9 +8839,9 @@ ${fileList}`;
|
|
|
8399
8839
|
}
|
|
8400
8840
|
resolveLogDirectory() {
|
|
8401
8841
|
if (this.config.logDir) {
|
|
8402
|
-
return
|
|
8842
|
+
return path21.resolve(this.config.logDir);
|
|
8403
8843
|
}
|
|
8404
|
-
return
|
|
8844
|
+
return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
8405
8845
|
}
|
|
8406
8846
|
async createStreamLogger(request) {
|
|
8407
8847
|
const logDir = this.resolveLogDirectory();
|
|
@@ -8415,7 +8855,7 @@ ${fileList}`;
|
|
|
8415
8855
|
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
8416
8856
|
return void 0;
|
|
8417
8857
|
}
|
|
8418
|
-
const filePath =
|
|
8858
|
+
const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
|
|
8419
8859
|
try {
|
|
8420
8860
|
const logger = await PiStreamLogger2.create({
|
|
8421
8861
|
filePath,
|
|
@@ -8640,7 +9080,7 @@ import path30 from "node:path";
|
|
|
8640
9080
|
// src/evaluation/providers/vscode/utils/fs.ts
|
|
8641
9081
|
import { constants as constants2 } from "node:fs";
|
|
8642
9082
|
import { access as access2, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
|
|
8643
|
-
import
|
|
9083
|
+
import path22 from "node:path";
|
|
8644
9084
|
async function pathExists(target) {
|
|
8645
9085
|
try {
|
|
8646
9086
|
await access2(target, constants2.F_OK);
|
|
@@ -8656,7 +9096,7 @@ async function readDirEntries(target) {
|
|
|
8656
9096
|
const entries = await readdir2(target, { withFileTypes: true });
|
|
8657
9097
|
return entries.map((entry) => ({
|
|
8658
9098
|
name: entry.name,
|
|
8659
|
-
absolutePath:
|
|
9099
|
+
absolutePath: path22.join(target, entry.name),
|
|
8660
9100
|
isDirectory: entry.isDirectory()
|
|
8661
9101
|
}));
|
|
8662
9102
|
}
|
|
@@ -8671,9 +9111,9 @@ async function removeIfExists(target) {
|
|
|
8671
9111
|
}
|
|
8672
9112
|
|
|
8673
9113
|
// src/evaluation/providers/vscode/utils/path.ts
|
|
8674
|
-
import
|
|
9114
|
+
import path23 from "node:path";
|
|
8675
9115
|
function pathToFileUri2(filePath) {
|
|
8676
|
-
const absolutePath =
|
|
9116
|
+
const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
|
|
8677
9117
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
8678
9118
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
8679
9119
|
return `file:///${normalizedPath}`;
|
|
@@ -8682,7 +9122,7 @@ function pathToFileUri2(filePath) {
|
|
|
8682
9122
|
}
|
|
8683
9123
|
|
|
8684
9124
|
// src/evaluation/providers/vscode/dispatch/promptBuilder.ts
|
|
8685
|
-
import
|
|
9125
|
+
import path24 from "node:path";
|
|
8686
9126
|
|
|
8687
9127
|
// src/evaluation/providers/vscode/utils/template.ts
|
|
8688
9128
|
function renderTemplate2(content, variables) {
|
|
@@ -8774,8 +9214,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
|
|
|
8774
9214
|
});
|
|
8775
9215
|
}
|
|
8776
9216
|
function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
|
|
8777
|
-
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${
|
|
8778
|
-
const responseList = responseFiles.map((file) => `"${
|
|
9217
|
+
const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
|
|
9218
|
+
const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
|
|
8779
9219
|
return renderTemplate2(templateContent, {
|
|
8780
9220
|
requestFiles: requestLines,
|
|
8781
9221
|
responseList
|
|
@@ -8784,7 +9224,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
|
|
|
8784
9224
|
|
|
8785
9225
|
// src/evaluation/providers/vscode/dispatch/responseWaiter.ts
|
|
8786
9226
|
import { readFile as readFile9 } from "node:fs/promises";
|
|
8787
|
-
import
|
|
9227
|
+
import path25 from "node:path";
|
|
8788
9228
|
|
|
8789
9229
|
// src/evaluation/providers/vscode/utils/time.ts
|
|
8790
9230
|
function sleep2(ms) {
|
|
@@ -8843,7 +9283,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
|
|
|
8843
9283
|
}
|
|
8844
9284
|
async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
|
|
8845
9285
|
if (!silent) {
|
|
8846
|
-
const fileList = responseFilesFinal.map((file) =>
|
|
9286
|
+
const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
|
|
8847
9287
|
console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
|
|
8848
9288
|
}
|
|
8849
9289
|
const deadline = Date.now() + timeoutMs;
|
|
@@ -8852,7 +9292,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
|
|
|
8852
9292
|
while (pending.size > 0) {
|
|
8853
9293
|
if (Date.now() >= deadline) {
|
|
8854
9294
|
if (!silent) {
|
|
8855
|
-
const remaining = [...pending].map((f) =>
|
|
9295
|
+
const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
|
|
8856
9296
|
console.error(
|
|
8857
9297
|
`error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
|
|
8858
9298
|
);
|
|
@@ -8908,36 +9348,6 @@ import { promisify as promisify2 } from "node:util";
|
|
|
8908
9348
|
|
|
8909
9349
|
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
8910
9350
|
import path26 from "node:path";
|
|
8911
|
-
|
|
8912
|
-
// src/paths.ts
|
|
8913
|
-
import os2 from "node:os";
|
|
8914
|
-
import path25 from "node:path";
|
|
8915
|
-
var logged = false;
|
|
8916
|
-
function getAgentvHome() {
|
|
8917
|
-
const envHome = process.env.AGENTV_HOME;
|
|
8918
|
-
if (envHome && envHome !== "undefined") {
|
|
8919
|
-
if (!logged) {
|
|
8920
|
-
logged = true;
|
|
8921
|
-
console.warn(`Using AGENTV_HOME: ${envHome}`);
|
|
8922
|
-
}
|
|
8923
|
-
return envHome;
|
|
8924
|
-
}
|
|
8925
|
-
return path25.join(os2.homedir(), ".agentv");
|
|
8926
|
-
}
|
|
8927
|
-
function getWorkspacesRoot() {
|
|
8928
|
-
return path25.join(getAgentvHome(), "workspaces");
|
|
8929
|
-
}
|
|
8930
|
-
function getSubagentsRoot() {
|
|
8931
|
-
return path25.join(getAgentvHome(), "subagents");
|
|
8932
|
-
}
|
|
8933
|
-
function getTraceStateRoot() {
|
|
8934
|
-
return path25.join(getAgentvHome(), "trace-state");
|
|
8935
|
-
}
|
|
8936
|
-
function getWorkspacePoolRoot() {
|
|
8937
|
-
return path25.join(getAgentvHome(), "workspace-pool");
|
|
8938
|
-
}
|
|
8939
|
-
|
|
8940
|
-
// src/evaluation/providers/vscode/dispatch/constants.ts
|
|
8941
9351
|
var DEFAULT_LOCK_NAME = "subagent.lock";
|
|
8942
9352
|
var DEFAULT_ALIVE_FILENAME = ".alive";
|
|
8943
9353
|
function getDefaultSubagentRoot(vscodeCmd = "code") {
|
|
@@ -10194,9 +10604,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
|
|
|
10194
10604
|
}
|
|
10195
10605
|
|
|
10196
10606
|
// src/evaluation/evaluators/scoring.ts
|
|
10197
|
-
var
|
|
10198
|
-
|
|
10199
|
-
|
|
10607
|
+
var DEFAULT_THRESHOLD = 0.8;
|
|
10608
|
+
var PASS_THRESHOLD = DEFAULT_THRESHOLD;
|
|
10609
|
+
function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
|
|
10610
|
+
return score >= threshold ? "pass" : "fail";
|
|
10200
10611
|
}
|
|
10201
10612
|
function clampScore(value) {
|
|
10202
10613
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
@@ -10385,13 +10796,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
10385
10796
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
10386
10797
|
const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
|
|
10387
10798
|
const { tmpdir: tmpdir3 } = await import("node:os");
|
|
10388
|
-
const
|
|
10799
|
+
const path50 = await import("node:path");
|
|
10389
10800
|
const { randomUUID: randomUUID10 } = await import("node:crypto");
|
|
10390
|
-
const dir =
|
|
10801
|
+
const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
10391
10802
|
await mkdir16(dir, { recursive: true });
|
|
10392
|
-
const stdinPath =
|
|
10393
|
-
const stdoutPath =
|
|
10394
|
-
const stderrPath =
|
|
10803
|
+
const stdinPath = path50.join(dir, "stdin.txt");
|
|
10804
|
+
const stdoutPath = path50.join(dir, "stdout.txt");
|
|
10805
|
+
const stderrPath = path50.join(dir, "stderr.txt");
|
|
10395
10806
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
10396
10807
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
10397
10808
|
const { spawn: spawn5 } = await import("node:child_process");
|
|
@@ -11589,7 +12000,7 @@ ${outputSchema}`;
|
|
|
11589
12000
|
parts.push("[[ ## scoring_criteria ## ]]");
|
|
11590
12001
|
for (const rubric of rubrics) {
|
|
11591
12002
|
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
11592
|
-
const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
12003
|
+
const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
|
|
11593
12004
|
parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
|
|
11594
12005
|
if (rubric.outcome) {
|
|
11595
12006
|
parts.push(`Description: ${rubric.outcome}`);
|
|
@@ -11643,54 +12054,106 @@ ${outputSchema}`;
|
|
|
11643
12054
|
async runWithRetry(options) {
|
|
11644
12055
|
const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
11645
12056
|
let lastError;
|
|
12057
|
+
let lastInvalidResponse;
|
|
12058
|
+
let shouldAttemptStructureFix = false;
|
|
11646
12059
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
11647
12060
|
try {
|
|
11648
|
-
const
|
|
11649
|
-
|
|
11650
|
-
|
|
11651
|
-
|
|
11652
|
-
|
|
11653
|
-
|
|
11654
|
-
|
|
11655
|
-
|
|
11656
|
-
|
|
11657
|
-
|
|
11658
|
-
|
|
11659
|
-
|
|
11660
|
-
|
|
11661
|
-
|
|
11662
|
-
|
|
11663
|
-
|
|
11664
|
-
]
|
|
11665
|
-
}
|
|
11666
|
-
],
|
|
11667
|
-
...modelOptions
|
|
11668
|
-
}) : await generateText2({
|
|
11669
|
-
model,
|
|
11670
|
-
system: systemPrompt,
|
|
11671
|
-
prompt: userPrompt,
|
|
11672
|
-
...modelOptions
|
|
11673
|
-
});
|
|
11674
|
-
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
11675
|
-
const rawUsage = result.usage;
|
|
11676
|
-
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
11677
|
-
return { data: data2, tokenUsage };
|
|
12061
|
+
const result = await this.generateStructuredResponse({
|
|
12062
|
+
context,
|
|
12063
|
+
graderProvider,
|
|
12064
|
+
systemPrompt,
|
|
12065
|
+
userPrompt,
|
|
12066
|
+
images
|
|
12067
|
+
});
|
|
12068
|
+
const canRepairResponse = result.text.trim().length > 0;
|
|
12069
|
+
lastInvalidResponse = canRepairResponse ? result : void 0;
|
|
12070
|
+
let data;
|
|
12071
|
+
try {
|
|
12072
|
+
data = schema.parse(parseJsonFromText(result.text));
|
|
12073
|
+
} catch (e) {
|
|
12074
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
12075
|
+
shouldAttemptStructureFix = canRepairResponse;
|
|
12076
|
+
continue;
|
|
11678
12077
|
}
|
|
11679
|
-
|
|
11680
|
-
|
|
12078
|
+
return {
|
|
12079
|
+
data,
|
|
12080
|
+
providerResponse: result.providerResponse,
|
|
12081
|
+
tokenUsage: result.tokenUsage
|
|
12082
|
+
};
|
|
12083
|
+
} catch (e) {
|
|
12084
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
12085
|
+
}
|
|
12086
|
+
}
|
|
12087
|
+
if (shouldAttemptStructureFix && lastInvalidResponse) {
|
|
12088
|
+
try {
|
|
12089
|
+
const repaired = await this.generateStructuredResponse({
|
|
12090
|
+
context,
|
|
12091
|
+
graderProvider,
|
|
11681
12092
|
systemPrompt,
|
|
11682
|
-
|
|
11683
|
-
|
|
11684
|
-
|
|
11685
|
-
|
|
12093
|
+
userPrompt: buildStructureRepairPrompt({
|
|
12094
|
+
validationError: lastError?.message ?? "Schema validation failed",
|
|
12095
|
+
invalidResponse: lastInvalidResponse.text
|
|
12096
|
+
})
|
|
11686
12097
|
});
|
|
11687
|
-
const data = schema.parse(parseJsonFromText(
|
|
11688
|
-
return {
|
|
12098
|
+
const data = schema.parse(parseJsonFromText(repaired.text));
|
|
12099
|
+
return {
|
|
12100
|
+
data,
|
|
12101
|
+
providerResponse: repaired.providerResponse,
|
|
12102
|
+
tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
|
|
12103
|
+
};
|
|
11689
12104
|
} catch (e) {
|
|
11690
12105
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
11691
12106
|
}
|
|
11692
12107
|
}
|
|
11693
|
-
throw new Error(
|
|
12108
|
+
throw new Error(
|
|
12109
|
+
`Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
|
|
12110
|
+
);
|
|
12111
|
+
}
|
|
12112
|
+
async generateStructuredResponse(options) {
|
|
12113
|
+
const { context, graderProvider, systemPrompt, userPrompt, images } = options;
|
|
12114
|
+
const model = graderProvider.asLanguageModel?.();
|
|
12115
|
+
if (model) {
|
|
12116
|
+
const modelOptions = {
|
|
12117
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
12118
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
12119
|
+
};
|
|
12120
|
+
const hasImages = images && images.length > 0;
|
|
12121
|
+
const result = hasImages ? await generateText2({
|
|
12122
|
+
model,
|
|
12123
|
+
system: systemPrompt,
|
|
12124
|
+
messages: [
|
|
12125
|
+
{
|
|
12126
|
+
role: "user",
|
|
12127
|
+
content: [
|
|
12128
|
+
{ type: "text", text: userPrompt },
|
|
12129
|
+
...toAiSdkImageParts(images)
|
|
12130
|
+
]
|
|
12131
|
+
}
|
|
12132
|
+
],
|
|
12133
|
+
...modelOptions
|
|
12134
|
+
}) : await generateText2({
|
|
12135
|
+
model,
|
|
12136
|
+
system: systemPrompt,
|
|
12137
|
+
prompt: userPrompt,
|
|
12138
|
+
...modelOptions
|
|
12139
|
+
});
|
|
12140
|
+
const rawUsage = result.usage;
|
|
12141
|
+
const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
|
|
12142
|
+
return { text: result.text, tokenUsage };
|
|
12143
|
+
}
|
|
12144
|
+
const response = await graderProvider.invoke({
|
|
12145
|
+
question: userPrompt,
|
|
12146
|
+
systemPrompt,
|
|
12147
|
+
evalCaseId: context.evalCase.id,
|
|
12148
|
+
attempt: context.attempt,
|
|
12149
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
12150
|
+
temperature: this.temperature
|
|
12151
|
+
});
|
|
12152
|
+
return {
|
|
12153
|
+
text: extractLastAssistantContent(response.output),
|
|
12154
|
+
providerResponse: response,
|
|
12155
|
+
tokenUsage: response.tokenUsage
|
|
12156
|
+
};
|
|
11694
12157
|
}
|
|
11695
12158
|
};
|
|
11696
12159
|
function buildOutputSchema() {
|
|
@@ -11710,6 +12173,29 @@ function buildOutputSchema() {
|
|
|
11710
12173
|
"}"
|
|
11711
12174
|
].join("\n");
|
|
11712
12175
|
}
|
|
12176
|
+
function buildStructureRepairPrompt(options) {
|
|
12177
|
+
const { validationError, invalidResponse } = options;
|
|
12178
|
+
return [
|
|
12179
|
+
"The following evaluation response has useful grading content but invalid JSON structure.",
|
|
12180
|
+
"Repair it to satisfy the schema in the system prompt.",
|
|
12181
|
+
"Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
|
|
12182
|
+
"",
|
|
12183
|
+
"Validation error:",
|
|
12184
|
+
validationError,
|
|
12185
|
+
"",
|
|
12186
|
+
"Invalid response:",
|
|
12187
|
+
invalidResponse
|
|
12188
|
+
].join("\n");
|
|
12189
|
+
}
|
|
12190
|
+
function sumTokenUsage(first, second) {
|
|
12191
|
+
if (!first && !second) {
|
|
12192
|
+
return void 0;
|
|
12193
|
+
}
|
|
12194
|
+
return {
|
|
12195
|
+
input: (first?.input ?? 0) + (second?.input ?? 0),
|
|
12196
|
+
output: (first?.output ?? 0) + (second?.output ?? 0)
|
|
12197
|
+
};
|
|
12198
|
+
}
|
|
11713
12199
|
function buildRubricOutputSchema() {
|
|
11714
12200
|
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
11715
12201
|
You must return a valid JSON object matching this schema:
|
|
@@ -11809,19 +12295,21 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
11809
12295
|
rawScores[rubric.id] = rawScore;
|
|
11810
12296
|
totalWeight += rubric.weight;
|
|
11811
12297
|
weightedScoreSum += normalizedScore * rubric.weight;
|
|
11812
|
-
let
|
|
11813
|
-
if (rubric.
|
|
11814
|
-
|
|
12298
|
+
let minScoreThreshold;
|
|
12299
|
+
if (rubric.min_score !== void 0) {
|
|
12300
|
+
minScoreThreshold = rubric.min_score;
|
|
12301
|
+
} else if (rubric.required_min_score !== void 0) {
|
|
12302
|
+
minScoreThreshold = rubric.required_min_score / 10;
|
|
11815
12303
|
} else if (rubric.required === true) {
|
|
11816
|
-
|
|
12304
|
+
minScoreThreshold = 1;
|
|
11817
12305
|
}
|
|
11818
12306
|
const matchingRange = rubric.score_ranges?.find(
|
|
11819
12307
|
(r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
|
|
11820
12308
|
);
|
|
11821
12309
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
11822
12310
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
11823
|
-
const passed = !(
|
|
11824
|
-
if (
|
|
12311
|
+
const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
|
|
12312
|
+
if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
|
|
11825
12313
|
failedRequired = true;
|
|
11826
12314
|
}
|
|
11827
12315
|
assertions.push({
|
|
@@ -11898,11 +12386,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
11898
12386
|
execute: async (input) => {
|
|
11899
12387
|
try {
|
|
11900
12388
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
11901
|
-
const
|
|
11902
|
-
if (
|
|
12389
|
+
const stat11 = await fs2.stat(resolved);
|
|
12390
|
+
if (stat11.isDirectory()) {
|
|
11903
12391
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
11904
12392
|
}
|
|
11905
|
-
const buffer = Buffer.alloc(Math.min(
|
|
12393
|
+
const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
|
|
11906
12394
|
const fd = await fs2.open(resolved, "r");
|
|
11907
12395
|
try {
|
|
11908
12396
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -11910,8 +12398,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
11910
12398
|
await fd.close();
|
|
11911
12399
|
}
|
|
11912
12400
|
const content = buffer.toString("utf-8");
|
|
11913
|
-
const truncated =
|
|
11914
|
-
return { content, truncated, size:
|
|
12401
|
+
const truncated = stat11.size > MAX_FILE_SIZE;
|
|
12402
|
+
return { content, truncated, size: stat11.size };
|
|
11915
12403
|
} catch (error) {
|
|
11916
12404
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
11917
12405
|
}
|
|
@@ -11962,8 +12450,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
11962
12450
|
const ext = path35.extname(entry.name).toLowerCase();
|
|
11963
12451
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
11964
12452
|
try {
|
|
11965
|
-
const
|
|
11966
|
-
if (
|
|
12453
|
+
const stat11 = await fs2.stat(fullPath);
|
|
12454
|
+
if (stat11.size > MAX_FILE_SIZE) continue;
|
|
11967
12455
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
11968
12456
|
const lines = content.split("\n");
|
|
11969
12457
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -12604,115 +13092,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
12604
13092
|
* Evaluate a single field against the expected value.
|
|
12605
13093
|
*/
|
|
12606
13094
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
12607
|
-
const { path:
|
|
12608
|
-
const candidateValue = resolvePath(candidateData,
|
|
12609
|
-
const expectedValue = resolvePath(expectedData,
|
|
13095
|
+
const { path: path50, match, required = true, weight = 1 } = fieldConfig;
|
|
13096
|
+
const candidateValue = resolvePath(candidateData, path50);
|
|
13097
|
+
const expectedValue = resolvePath(expectedData, path50);
|
|
12610
13098
|
if (expectedValue === void 0) {
|
|
12611
13099
|
return {
|
|
12612
|
-
path:
|
|
13100
|
+
path: path50,
|
|
12613
13101
|
score: 1,
|
|
12614
13102
|
// No expected value means no comparison needed
|
|
12615
13103
|
weight,
|
|
12616
13104
|
hit: true,
|
|
12617
|
-
message: `${
|
|
13105
|
+
message: `${path50}: no expected value`
|
|
12618
13106
|
};
|
|
12619
13107
|
}
|
|
12620
13108
|
if (candidateValue === void 0) {
|
|
12621
13109
|
if (required) {
|
|
12622
13110
|
return {
|
|
12623
|
-
path:
|
|
13111
|
+
path: path50,
|
|
12624
13112
|
score: 0,
|
|
12625
13113
|
weight,
|
|
12626
13114
|
hit: false,
|
|
12627
|
-
message: `${
|
|
13115
|
+
message: `${path50} (required, missing)`
|
|
12628
13116
|
};
|
|
12629
13117
|
}
|
|
12630
13118
|
return {
|
|
12631
|
-
path:
|
|
13119
|
+
path: path50,
|
|
12632
13120
|
score: 1,
|
|
12633
13121
|
// Don't penalize missing optional fields
|
|
12634
13122
|
weight: 0,
|
|
12635
13123
|
// Zero weight means it won't affect the score
|
|
12636
13124
|
hit: true,
|
|
12637
|
-
message: `${
|
|
13125
|
+
message: `${path50}: optional field missing`
|
|
12638
13126
|
};
|
|
12639
13127
|
}
|
|
12640
13128
|
switch (match) {
|
|
12641
13129
|
case "exact":
|
|
12642
|
-
return this.compareExact(
|
|
13130
|
+
return this.compareExact(path50, candidateValue, expectedValue, weight);
|
|
12643
13131
|
case "numeric_tolerance":
|
|
12644
13132
|
return this.compareNumericTolerance(
|
|
12645
|
-
|
|
13133
|
+
path50,
|
|
12646
13134
|
candidateValue,
|
|
12647
13135
|
expectedValue,
|
|
12648
13136
|
fieldConfig,
|
|
12649
13137
|
weight
|
|
12650
13138
|
);
|
|
12651
13139
|
case "date":
|
|
12652
|
-
return this.compareDate(
|
|
13140
|
+
return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
|
|
12653
13141
|
default:
|
|
12654
13142
|
return {
|
|
12655
|
-
path:
|
|
13143
|
+
path: path50,
|
|
12656
13144
|
score: 0,
|
|
12657
13145
|
weight,
|
|
12658
13146
|
hit: false,
|
|
12659
|
-
message: `${
|
|
13147
|
+
message: `${path50}: unknown match type "${match}"`
|
|
12660
13148
|
};
|
|
12661
13149
|
}
|
|
12662
13150
|
}
|
|
12663
13151
|
/**
|
|
12664
13152
|
* Exact equality comparison.
|
|
12665
13153
|
*/
|
|
12666
|
-
compareExact(
|
|
13154
|
+
compareExact(path50, candidateValue, expectedValue, weight) {
|
|
12667
13155
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
12668
13156
|
return {
|
|
12669
|
-
path:
|
|
13157
|
+
path: path50,
|
|
12670
13158
|
score: 1,
|
|
12671
13159
|
weight,
|
|
12672
13160
|
hit: true,
|
|
12673
|
-
message:
|
|
13161
|
+
message: path50
|
|
12674
13162
|
};
|
|
12675
13163
|
}
|
|
12676
13164
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
12677
13165
|
return {
|
|
12678
|
-
path:
|
|
13166
|
+
path: path50,
|
|
12679
13167
|
score: 0,
|
|
12680
13168
|
weight,
|
|
12681
13169
|
hit: false,
|
|
12682
|
-
message: `${
|
|
13170
|
+
message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
12683
13171
|
};
|
|
12684
13172
|
}
|
|
12685
13173
|
return {
|
|
12686
|
-
path:
|
|
13174
|
+
path: path50,
|
|
12687
13175
|
score: 0,
|
|
12688
13176
|
weight,
|
|
12689
13177
|
hit: false,
|
|
12690
|
-
message: `${
|
|
13178
|
+
message: `${path50} (value mismatch)`
|
|
12691
13179
|
};
|
|
12692
13180
|
}
|
|
12693
13181
|
/**
|
|
12694
13182
|
* Numeric comparison with absolute or relative tolerance.
|
|
12695
13183
|
*/
|
|
12696
|
-
compareNumericTolerance(
|
|
13184
|
+
compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12697
13185
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
12698
13186
|
const candidateNum = toNumber(candidateValue);
|
|
12699
13187
|
const expectedNum = toNumber(expectedValue);
|
|
12700
13188
|
if (candidateNum === null || expectedNum === null) {
|
|
12701
13189
|
return {
|
|
12702
|
-
path:
|
|
13190
|
+
path: path50,
|
|
12703
13191
|
score: 0,
|
|
12704
13192
|
weight,
|
|
12705
13193
|
hit: false,
|
|
12706
|
-
message: `${
|
|
13194
|
+
message: `${path50} (non-numeric value)`
|
|
12707
13195
|
};
|
|
12708
13196
|
}
|
|
12709
13197
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
12710
13198
|
return {
|
|
12711
|
-
path:
|
|
13199
|
+
path: path50,
|
|
12712
13200
|
score: 0,
|
|
12713
13201
|
weight,
|
|
12714
13202
|
hit: false,
|
|
12715
|
-
message: `${
|
|
13203
|
+
message: `${path50} (invalid numeric value)`
|
|
12716
13204
|
};
|
|
12717
13205
|
}
|
|
12718
13206
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -12725,61 +13213,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
12725
13213
|
}
|
|
12726
13214
|
if (withinTolerance) {
|
|
12727
13215
|
return {
|
|
12728
|
-
path:
|
|
13216
|
+
path: path50,
|
|
12729
13217
|
score: 1,
|
|
12730
13218
|
weight,
|
|
12731
13219
|
hit: true,
|
|
12732
|
-
message: `${
|
|
13220
|
+
message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
|
|
12733
13221
|
};
|
|
12734
13222
|
}
|
|
12735
13223
|
return {
|
|
12736
|
-
path:
|
|
13224
|
+
path: path50,
|
|
12737
13225
|
score: 0,
|
|
12738
13226
|
weight,
|
|
12739
13227
|
hit: false,
|
|
12740
|
-
message: `${
|
|
13228
|
+
message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
12741
13229
|
};
|
|
12742
13230
|
}
|
|
12743
13231
|
/**
|
|
12744
13232
|
* Date comparison with format normalization.
|
|
12745
13233
|
*/
|
|
12746
|
-
compareDate(
|
|
13234
|
+
compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
|
|
12747
13235
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
12748
13236
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
12749
13237
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
12750
13238
|
if (candidateDate === null) {
|
|
12751
13239
|
return {
|
|
12752
|
-
path:
|
|
13240
|
+
path: path50,
|
|
12753
13241
|
score: 0,
|
|
12754
13242
|
weight,
|
|
12755
13243
|
hit: false,
|
|
12756
|
-
message: `${
|
|
13244
|
+
message: `${path50} (unparseable candidate date)`
|
|
12757
13245
|
};
|
|
12758
13246
|
}
|
|
12759
13247
|
if (expectedDate === null) {
|
|
12760
13248
|
return {
|
|
12761
|
-
path:
|
|
13249
|
+
path: path50,
|
|
12762
13250
|
score: 0,
|
|
12763
13251
|
weight,
|
|
12764
13252
|
hit: false,
|
|
12765
|
-
message: `${
|
|
13253
|
+
message: `${path50} (unparseable expected date)`
|
|
12766
13254
|
};
|
|
12767
13255
|
}
|
|
12768
13256
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
12769
13257
|
return {
|
|
12770
|
-
path:
|
|
13258
|
+
path: path50,
|
|
12771
13259
|
score: 1,
|
|
12772
13260
|
weight,
|
|
12773
13261
|
hit: true,
|
|
12774
|
-
message:
|
|
13262
|
+
message: path50
|
|
12775
13263
|
};
|
|
12776
13264
|
}
|
|
12777
13265
|
return {
|
|
12778
|
-
path:
|
|
13266
|
+
path: path50,
|
|
12779
13267
|
score: 0,
|
|
12780
13268
|
weight,
|
|
12781
13269
|
hit: false,
|
|
12782
|
-
message: `${
|
|
13270
|
+
message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
12783
13271
|
};
|
|
12784
13272
|
}
|
|
12785
13273
|
/**
|
|
@@ -12812,11 +13300,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
12812
13300
|
};
|
|
12813
13301
|
}
|
|
12814
13302
|
};
|
|
12815
|
-
function resolvePath(obj,
|
|
12816
|
-
if (!
|
|
13303
|
+
function resolvePath(obj, path50) {
|
|
13304
|
+
if (!path50 || !obj) {
|
|
12817
13305
|
return void 0;
|
|
12818
13306
|
}
|
|
12819
|
-
const parts =
|
|
13307
|
+
const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
12820
13308
|
let current = obj;
|
|
12821
13309
|
for (const part of parts) {
|
|
12822
13310
|
if (current === null || current === void 0) {
|
|
@@ -13308,8 +13796,8 @@ var TokenUsageEvaluator = class {
|
|
|
13308
13796
|
};
|
|
13309
13797
|
|
|
13310
13798
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
13311
|
-
function getNestedValue(obj,
|
|
13312
|
-
const parts =
|
|
13799
|
+
function getNestedValue(obj, path50) {
|
|
13800
|
+
const parts = path50.split(".");
|
|
13313
13801
|
let current = obj;
|
|
13314
13802
|
for (const part of parts) {
|
|
13315
13803
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -15081,7 +15569,7 @@ var WorkspacePoolManager = class {
|
|
|
15081
15569
|
}
|
|
15082
15570
|
/**
|
|
15083
15571
|
* Reset an existing slot for reuse:
|
|
15084
|
-
* 1. Reset repos (git reset --hard
|
|
15572
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
15085
15573
|
* 2. Re-copy template files (skip repo directories)
|
|
15086
15574
|
*/
|
|
15087
15575
|
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
@@ -15094,7 +15582,17 @@ var WorkspacePoolManager = class {
|
|
|
15094
15582
|
continue;
|
|
15095
15583
|
}
|
|
15096
15584
|
const ref = repo.checkout?.ref ?? "HEAD";
|
|
15097
|
-
|
|
15585
|
+
const resolve = repo.checkout?.resolve ?? "remote";
|
|
15586
|
+
if (resolve === "remote") {
|
|
15587
|
+
const fetchArgs = ["fetch", "origin", ref];
|
|
15588
|
+
if (repo.clone?.depth) {
|
|
15589
|
+
fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
|
|
15590
|
+
}
|
|
15591
|
+
await git(fetchArgs, { cwd: repoDir });
|
|
15592
|
+
await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
|
|
15593
|
+
} else {
|
|
15594
|
+
await git(["reset", "--hard", ref], { cwd: repoDir });
|
|
15595
|
+
}
|
|
15098
15596
|
const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
|
|
15099
15597
|
await git(["clean", cleanFlag], { cwd: repoDir });
|
|
15100
15598
|
}
|
|
@@ -15391,7 +15889,7 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
15391
15889
|
}
|
|
15392
15890
|
|
|
15393
15891
|
// src/evaluation/orchestrator.ts
|
|
15394
|
-
function classifyQualityStatus(score, threshold =
|
|
15892
|
+
function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
|
|
15395
15893
|
return score >= threshold ? "ok" : "quality_failure";
|
|
15396
15894
|
}
|
|
15397
15895
|
function buildSkippedEvaluatorError(scores) {
|
|
@@ -15483,7 +15981,7 @@ async function runEvaluation(options) {
|
|
|
15483
15981
|
const filteredEvalCases = filterEvalCases(evalCases, filter);
|
|
15484
15982
|
if (filteredEvalCases.length === 0) {
|
|
15485
15983
|
if (filter) {
|
|
15486
|
-
throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
|
|
15984
|
+
throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
|
|
15487
15985
|
}
|
|
15488
15986
|
return [];
|
|
15489
15987
|
}
|
|
@@ -15509,20 +16007,10 @@ async function runEvaluation(options) {
|
|
|
15509
16007
|
if (resolvedTargetsByName.has(name)) {
|
|
15510
16008
|
return resolvedTargetsByName.get(name);
|
|
15511
16009
|
}
|
|
15512
|
-
|
|
16010
|
+
const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
|
|
15513
16011
|
if (!definition) {
|
|
15514
16012
|
return void 0;
|
|
15515
16013
|
}
|
|
15516
|
-
for (let depth = 0; depth < 5; depth++) {
|
|
15517
|
-
const useTarget = definition.use_target;
|
|
15518
|
-
if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
|
|
15519
|
-
const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
|
|
15520
|
-
const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
|
|
15521
|
-
if (resolvedName.length === 0) break;
|
|
15522
|
-
const next = targetDefinitions.get(resolvedName);
|
|
15523
|
-
if (!next) break;
|
|
15524
|
-
definition = next;
|
|
15525
|
-
}
|
|
15526
16014
|
const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
|
|
15527
16015
|
resolvedTargetsByName.set(name, resolved);
|
|
15528
16016
|
return resolved;
|
|
@@ -15545,6 +16033,9 @@ async function runEvaluation(options) {
|
|
|
15545
16033
|
const graderName = targetContext.graderTarget ?? targetContext.name;
|
|
15546
16034
|
const resolvedGrader = resolveTargetByName(graderName);
|
|
15547
16035
|
if (!resolvedGrader) {
|
|
16036
|
+
if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
|
|
16037
|
+
return void 0;
|
|
16038
|
+
}
|
|
15548
16039
|
return getOrCreateProvider(targetContext);
|
|
15549
16040
|
}
|
|
15550
16041
|
return getOrCreateProvider(resolvedGrader);
|
|
@@ -15875,7 +16366,7 @@ async function runEvaluation(options) {
|
|
|
15875
16366
|
const budgetResult = {
|
|
15876
16367
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
15877
16368
|
testId: evalCase.id,
|
|
15878
|
-
|
|
16369
|
+
suite: evalCase.suite,
|
|
15879
16370
|
category: evalCase.category,
|
|
15880
16371
|
score: 0,
|
|
15881
16372
|
assertions: [],
|
|
@@ -15912,7 +16403,7 @@ async function runEvaluation(options) {
|
|
|
15912
16403
|
const haltResult = {
|
|
15913
16404
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
15914
16405
|
testId: evalCase.id,
|
|
15915
|
-
|
|
16406
|
+
suite: evalCase.suite,
|
|
15916
16407
|
category: evalCase.category,
|
|
15917
16408
|
score: 0,
|
|
15918
16409
|
assertions: [],
|
|
@@ -16224,7 +16715,7 @@ async function runBatchEvaluation(options) {
|
|
|
16224
16715
|
targetResolver,
|
|
16225
16716
|
availableTargets,
|
|
16226
16717
|
verbose,
|
|
16227
|
-
threshold: batchThreshold
|
|
16718
|
+
threshold: evalCase.threshold ?? batchThreshold
|
|
16228
16719
|
});
|
|
16229
16720
|
if (providerError) {
|
|
16230
16721
|
result = {
|
|
@@ -16686,8 +17177,9 @@ async function runEvalCase(options) {
|
|
|
16686
17177
|
fileChanges,
|
|
16687
17178
|
workspacePath,
|
|
16688
17179
|
verbose,
|
|
16689
|
-
threshold: caseThreshold
|
|
17180
|
+
threshold: evalCase.threshold ?? caseThreshold
|
|
16690
17181
|
});
|
|
17182
|
+
const effectiveThreshold = evalCase.threshold ?? caseThreshold;
|
|
16691
17183
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
16692
17184
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
16693
17185
|
const evalRunTokenUsage = tokenUsage || graderTokens ? {
|
|
@@ -16701,7 +17193,7 @@ async function runEvalCase(options) {
|
|
|
16701
17193
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
16702
17194
|
};
|
|
16703
17195
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
16704
|
-
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score,
|
|
17196
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
|
|
16705
17197
|
const targetUsedField = targetUsed ? { targetUsed } : {};
|
|
16706
17198
|
const finalResult = providerError ? {
|
|
16707
17199
|
...result,
|
|
@@ -16902,7 +17394,8 @@ async function evaluateCandidate(options) {
|
|
|
16902
17394
|
targetResolver,
|
|
16903
17395
|
availableTargets,
|
|
16904
17396
|
fileChanges,
|
|
16905
|
-
workspacePath
|
|
17397
|
+
workspacePath,
|
|
17398
|
+
threshold: evalThreshold
|
|
16906
17399
|
});
|
|
16907
17400
|
const completedAt = nowFn();
|
|
16908
17401
|
let agentRequest;
|
|
@@ -16933,7 +17426,7 @@ async function evaluateCandidate(options) {
|
|
|
16933
17426
|
return {
|
|
16934
17427
|
timestamp: completedAt.toISOString(),
|
|
16935
17428
|
testId: evalCase.id,
|
|
16936
|
-
|
|
17429
|
+
suite: evalCase.suite,
|
|
16937
17430
|
category: evalCase.category,
|
|
16938
17431
|
conversationId: evalCase.conversation_id,
|
|
16939
17432
|
score: score.score,
|
|
@@ -16976,7 +17469,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
16976
17469
|
targetResolver,
|
|
16977
17470
|
availableTargets,
|
|
16978
17471
|
fileChanges,
|
|
16979
|
-
workspacePath
|
|
17472
|
+
workspacePath,
|
|
17473
|
+
threshold
|
|
16980
17474
|
} = options;
|
|
16981
17475
|
if (evalCase.assertions && evalCase.assertions.length > 0) {
|
|
16982
17476
|
return runEvaluatorList({
|
|
@@ -17002,7 +17496,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
17002
17496
|
targetResolver,
|
|
17003
17497
|
availableTargets,
|
|
17004
17498
|
fileChanges,
|
|
17005
|
-
workspacePath
|
|
17499
|
+
workspacePath,
|
|
17500
|
+
threshold
|
|
17006
17501
|
});
|
|
17007
17502
|
}
|
|
17008
17503
|
const evaluatorKind = evalCase.evaluator ?? "llm-grader";
|
|
@@ -17104,7 +17599,8 @@ async function runEvaluatorList(options) {
|
|
|
17104
17599
|
name: evaluatorConfig.name,
|
|
17105
17600
|
type: evaluatorConfig.type,
|
|
17106
17601
|
weight,
|
|
17107
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17602
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
17603
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
17108
17604
|
});
|
|
17109
17605
|
scores.push({
|
|
17110
17606
|
name: evaluatorConfig.name,
|
|
@@ -17139,7 +17635,8 @@ async function runEvaluatorList(options) {
|
|
|
17139
17635
|
name: evaluatorConfig.name ?? "unknown",
|
|
17140
17636
|
type: evaluatorConfig.type ?? "llm-grader",
|
|
17141
17637
|
weight,
|
|
17142
|
-
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
|
|
17638
|
+
...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
|
|
17639
|
+
...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
|
|
17143
17640
|
});
|
|
17144
17641
|
scores.push({
|
|
17145
17642
|
name: evaluatorConfig.name ?? "unknown",
|
|
@@ -17173,9 +17670,10 @@ async function runEvaluatorList(options) {
|
|
|
17173
17670
|
}
|
|
17174
17671
|
}
|
|
17175
17672
|
}
|
|
17673
|
+
const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
|
|
17176
17674
|
const hasRequiredFailure = scored.some((entry) => {
|
|
17177
17675
|
if (!entry.required) return false;
|
|
17178
|
-
const minScore = typeof entry.required === "number" ? entry.required :
|
|
17676
|
+
const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
|
|
17179
17677
|
return entry.score.score < minScore;
|
|
17180
17678
|
});
|
|
17181
17679
|
const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
|
|
@@ -17186,17 +17684,23 @@ async function runEvaluatorList(options) {
|
|
|
17186
17684
|
const expectedAspectCount = assertions.length || 1;
|
|
17187
17685
|
const score = {
|
|
17188
17686
|
score: aggregateScore,
|
|
17189
|
-
verdict: scoreToVerdict(aggregateScore),
|
|
17687
|
+
verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
|
|
17190
17688
|
assertions,
|
|
17191
17689
|
expectedAspectCount
|
|
17192
17690
|
};
|
|
17193
17691
|
return { score, scores };
|
|
17194
17692
|
}
|
|
17693
|
+
function formatFilter(filter) {
|
|
17694
|
+
return typeof filter === "string" ? filter : filter.join(", ");
|
|
17695
|
+
}
|
|
17696
|
+
function matchesFilter3(id, filter) {
|
|
17697
|
+
return typeof filter === "string" ? micromatch3.isMatch(id, filter) : filter.some((pattern) => micromatch3.isMatch(id, pattern));
|
|
17698
|
+
}
|
|
17195
17699
|
function filterEvalCases(evalCases, filter) {
|
|
17196
17700
|
if (!filter) {
|
|
17197
17701
|
return evalCases;
|
|
17198
17702
|
}
|
|
17199
|
-
return evalCases.filter((evalCase) =>
|
|
17703
|
+
return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
|
|
17200
17704
|
}
|
|
17201
17705
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
17202
17706
|
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
@@ -17283,7 +17787,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
17283
17787
|
return {
|
|
17284
17788
|
timestamp: timestamp.toISOString(),
|
|
17285
17789
|
testId: evalCase.id,
|
|
17286
|
-
|
|
17790
|
+
suite: evalCase.suite,
|
|
17287
17791
|
category: evalCase.category,
|
|
17288
17792
|
conversationId: evalCase.conversation_id,
|
|
17289
17793
|
score: 0,
|
|
@@ -17555,6 +18059,7 @@ async function evaluate(config) {
|
|
|
17555
18059
|
verbose: config.verbose,
|
|
17556
18060
|
maxConcurrency: config.workers ?? 3,
|
|
17557
18061
|
filter: config.filter,
|
|
18062
|
+
threshold: config.threshold,
|
|
17558
18063
|
evalCases,
|
|
17559
18064
|
onResult: async (result) => {
|
|
17560
18065
|
collectedResults.push(result);
|
|
@@ -17565,19 +18070,19 @@ async function evaluate(config) {
|
|
|
17565
18070
|
const durationMs = Date.now() - startTime;
|
|
17566
18071
|
return {
|
|
17567
18072
|
results: allResults,
|
|
17568
|
-
summary: computeSummary(allResults, durationMs)
|
|
18073
|
+
summary: computeSummary(allResults, durationMs, config.threshold)
|
|
17569
18074
|
};
|
|
17570
18075
|
}
|
|
17571
18076
|
function mapAssertionType(type) {
|
|
17572
18077
|
return type.replace(/_/g, "-");
|
|
17573
18078
|
}
|
|
17574
|
-
function computeSummary(results, durationMs) {
|
|
18079
|
+
function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
|
|
17575
18080
|
const total = results.length;
|
|
17576
18081
|
let passed = 0;
|
|
17577
18082
|
let scoreSum = 0;
|
|
17578
18083
|
for (const r of results) {
|
|
17579
18084
|
scoreSum += r.score;
|
|
17580
|
-
if (r.score >=
|
|
18085
|
+
if (r.score >= threshold) {
|
|
17581
18086
|
passed++;
|
|
17582
18087
|
}
|
|
17583
18088
|
}
|
|
@@ -17608,7 +18113,7 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
17608
18113
|
return null;
|
|
17609
18114
|
}
|
|
17610
18115
|
async function loadEnvHierarchy(repoRoot, startPath) {
|
|
17611
|
-
const { readFileSync:
|
|
18116
|
+
const { readFileSync: readFileSync4 } = await import("node:fs");
|
|
17612
18117
|
const chain = buildDirectoryChain(startPath, repoRoot);
|
|
17613
18118
|
const envFiles = [];
|
|
17614
18119
|
for (const dir of chain) {
|
|
@@ -17617,7 +18122,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
17617
18122
|
}
|
|
17618
18123
|
for (let i = 0; i < envFiles.length; i++) {
|
|
17619
18124
|
try {
|
|
17620
|
-
const content =
|
|
18125
|
+
const content = readFileSync4(envFiles[i], "utf8");
|
|
17621
18126
|
for (const line of content.split("\n")) {
|
|
17622
18127
|
const trimmed = line.trim();
|
|
17623
18128
|
if (!trimmed || trimmed.startsWith("#")) continue;
|
|
@@ -17690,7 +18195,7 @@ var CONFIG_FILE_NAMES = [
|
|
|
17690
18195
|
];
|
|
17691
18196
|
async function loadTsConfig(projectRoot) {
|
|
17692
18197
|
const { existsSync: existsSync7 } = await import("node:fs");
|
|
17693
|
-
const { pathToFileURL } = await import("node:url");
|
|
18198
|
+
const { pathToFileURL: pathToFileURL2 } = await import("node:url");
|
|
17694
18199
|
const { join: join2 } = await import("node:path");
|
|
17695
18200
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
17696
18201
|
const filePath = join2(projectRoot, fileName);
|
|
@@ -17698,7 +18203,7 @@ async function loadTsConfig(projectRoot) {
|
|
|
17698
18203
|
continue;
|
|
17699
18204
|
}
|
|
17700
18205
|
try {
|
|
17701
|
-
const fileUrl =
|
|
18206
|
+
const fileUrl = pathToFileURL2(filePath).href;
|
|
17702
18207
|
const mod = await import(fileUrl);
|
|
17703
18208
|
const config = mod.default ?? mod;
|
|
17704
18209
|
return AgentVConfigSchema.parse(config);
|
|
@@ -17832,7 +18337,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
17832
18337
|
}
|
|
17833
18338
|
|
|
17834
18339
|
// src/projects.ts
|
|
17835
|
-
import { existsSync as existsSync6, mkdirSync, readFileSync as
|
|
18340
|
+
import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
|
|
17836
18341
|
import path47 from "node:path";
|
|
17837
18342
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
17838
18343
|
function getProjectsRegistryPath() {
|
|
@@ -17844,7 +18349,7 @@ function loadProjectRegistry() {
|
|
|
17844
18349
|
return { projects: [] };
|
|
17845
18350
|
}
|
|
17846
18351
|
try {
|
|
17847
|
-
const raw =
|
|
18352
|
+
const raw = readFileSync3(registryPath, "utf-8");
|
|
17848
18353
|
const parsed = parseYaml3(raw);
|
|
17849
18354
|
if (!parsed || !Array.isArray(parsed.projects)) {
|
|
17850
18355
|
return { projects: [] };
|
|
@@ -17858,7 +18363,7 @@ function saveProjectRegistry(registry) {
|
|
|
17858
18363
|
const registryPath = getProjectsRegistryPath();
|
|
17859
18364
|
const dir = path47.dirname(registryPath);
|
|
17860
18365
|
if (!existsSync6(dir)) {
|
|
17861
|
-
|
|
18366
|
+
mkdirSync2(dir, { recursive: true });
|
|
17862
18367
|
}
|
|
17863
18368
|
writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
|
|
17864
18369
|
}
|
|
@@ -18124,7 +18629,7 @@ var OtelTraceExporter = class {
|
|
|
18124
18629
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
18125
18630
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
18126
18631
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
18127
|
-
if (result.
|
|
18632
|
+
if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
|
|
18128
18633
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
18129
18634
|
if (captureContent && result.output.length > 0) {
|
|
18130
18635
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -18333,7 +18838,7 @@ var OtelStreamingObserver = class {
|
|
|
18333
18838
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
18334
18839
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
18335
18840
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
18336
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
18841
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
|
|
18337
18842
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
18338
18843
|
}
|
|
18339
18844
|
/** Create and immediately export a tool span */
|
|
@@ -18682,11 +19187,241 @@ function extractToolResultContent(content) {
|
|
|
18682
19187
|
return parts.length > 0 ? parts.join("") : void 0;
|
|
18683
19188
|
}
|
|
18684
19189
|
|
|
18685
|
-
// src/import/
|
|
19190
|
+
// src/import/codex-parser.ts
|
|
19191
|
+
function parseCodexSession(jsonl) {
|
|
19192
|
+
const messages = [];
|
|
19193
|
+
let sessionId = "";
|
|
19194
|
+
let cwd;
|
|
19195
|
+
let model;
|
|
19196
|
+
let version;
|
|
19197
|
+
let startTimestamp;
|
|
19198
|
+
let endTimestamp;
|
|
19199
|
+
const pendingCalls = /* @__PURE__ */ new Map();
|
|
19200
|
+
const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
|
|
19201
|
+
for (const line of lines) {
|
|
19202
|
+
let entry;
|
|
19203
|
+
try {
|
|
19204
|
+
entry = JSON.parse(line);
|
|
19205
|
+
} catch {
|
|
19206
|
+
continue;
|
|
19207
|
+
}
|
|
19208
|
+
if (!entry.type) continue;
|
|
19209
|
+
if (entry.timestamp) {
|
|
19210
|
+
if (!startTimestamp) startTimestamp = entry.timestamp;
|
|
19211
|
+
endTimestamp = entry.timestamp;
|
|
19212
|
+
}
|
|
19213
|
+
const payload = entry.payload ?? {};
|
|
19214
|
+
switch (entry.type) {
|
|
19215
|
+
case "session_meta": {
|
|
19216
|
+
sessionId = String(payload.id ?? "");
|
|
19217
|
+
cwd = payload.cwd ? String(payload.cwd) : void 0;
|
|
19218
|
+
version = payload.cli_version ? String(payload.cli_version) : void 0;
|
|
19219
|
+
if (payload.model && !model) {
|
|
19220
|
+
model = String(payload.model);
|
|
19221
|
+
}
|
|
19222
|
+
break;
|
|
19223
|
+
}
|
|
19224
|
+
case "turn_context": {
|
|
19225
|
+
if (payload.model && !model) {
|
|
19226
|
+
model = String(payload.model);
|
|
19227
|
+
}
|
|
19228
|
+
if (payload.cwd && !cwd) {
|
|
19229
|
+
cwd = String(payload.cwd);
|
|
19230
|
+
}
|
|
19231
|
+
break;
|
|
19232
|
+
}
|
|
19233
|
+
case "response_item": {
|
|
19234
|
+
const itemType = String(payload.type ?? "");
|
|
19235
|
+
const role = String(payload.role ?? "");
|
|
19236
|
+
switch (itemType) {
|
|
19237
|
+
case "message": {
|
|
19238
|
+
if (role === "developer") break;
|
|
19239
|
+
const content = extractResponseItemContent(payload.content);
|
|
19240
|
+
if (role === "user" && content) {
|
|
19241
|
+
messages.push({ role: "user", content });
|
|
19242
|
+
} else if (role === "assistant" && content) {
|
|
19243
|
+
messages.push({ role: "assistant", content });
|
|
19244
|
+
}
|
|
19245
|
+
break;
|
|
19246
|
+
}
|
|
19247
|
+
case "function_call": {
|
|
19248
|
+
const toolName = String(payload.name ?? "");
|
|
19249
|
+
const callId = String(payload.call_id ?? "");
|
|
19250
|
+
let input;
|
|
19251
|
+
if (typeof payload.arguments === "string") {
|
|
19252
|
+
try {
|
|
19253
|
+
input = JSON.parse(payload.arguments);
|
|
19254
|
+
} catch {
|
|
19255
|
+
input = payload.arguments;
|
|
19256
|
+
}
|
|
19257
|
+
} else {
|
|
19258
|
+
input = payload.arguments;
|
|
19259
|
+
}
|
|
19260
|
+
const toolCall = { tool: toolName, input, id: callId };
|
|
19261
|
+
const msgIdx = messages.length;
|
|
19262
|
+
messages.push({
|
|
19263
|
+
role: "assistant",
|
|
19264
|
+
toolCalls: [toolCall]
|
|
19265
|
+
});
|
|
19266
|
+
if (callId) {
|
|
19267
|
+
pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
|
|
19268
|
+
}
|
|
19269
|
+
break;
|
|
19270
|
+
}
|
|
19271
|
+
case "custom_tool_call": {
|
|
19272
|
+
const toolName = String(payload.name ?? "");
|
|
19273
|
+
const callId = String(payload.call_id ?? "");
|
|
19274
|
+
let input;
|
|
19275
|
+
if (typeof payload.arguments === "string") {
|
|
19276
|
+
try {
|
|
19277
|
+
input = JSON.parse(payload.arguments);
|
|
19278
|
+
} catch {
|
|
19279
|
+
input = payload.arguments;
|
|
19280
|
+
}
|
|
19281
|
+
} else {
|
|
19282
|
+
input = payload.arguments;
|
|
19283
|
+
}
|
|
19284
|
+
const toolCall = { tool: toolName, input, id: callId };
|
|
19285
|
+
const msgIdx = messages.length;
|
|
19286
|
+
messages.push({
|
|
19287
|
+
role: "assistant",
|
|
19288
|
+
toolCalls: [toolCall]
|
|
19289
|
+
});
|
|
19290
|
+
if (callId) {
|
|
19291
|
+
pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
|
|
19292
|
+
}
|
|
19293
|
+
break;
|
|
19294
|
+
}
|
|
19295
|
+
case "function_call_output":
|
|
19296
|
+
case "custom_tool_call_output": {
|
|
19297
|
+
const callId = String(payload.call_id ?? "");
|
|
19298
|
+
const pending = pendingCalls.get(callId);
|
|
19299
|
+
if (pending) {
|
|
19300
|
+
const existingMsg = messages[pending.msgIdx];
|
|
19301
|
+
const existingCalls = [...existingMsg.toolCalls ?? []];
|
|
19302
|
+
existingCalls[pending.toolIdx] = {
|
|
19303
|
+
...existingCalls[pending.toolIdx],
|
|
19304
|
+
output: payload.output
|
|
19305
|
+
};
|
|
19306
|
+
messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
|
|
19307
|
+
pendingCalls.delete(callId);
|
|
19308
|
+
}
|
|
19309
|
+
break;
|
|
19310
|
+
}
|
|
19311
|
+
// Skip reasoning blocks (thinking tokens)
|
|
19312
|
+
case "reasoning":
|
|
19313
|
+
break;
|
|
19314
|
+
}
|
|
19315
|
+
break;
|
|
19316
|
+
}
|
|
19317
|
+
}
|
|
19318
|
+
}
|
|
19319
|
+
let durationMs;
|
|
19320
|
+
if (startTimestamp && endTimestamp) {
|
|
19321
|
+
durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
|
|
19322
|
+
}
|
|
19323
|
+
const source = {
|
|
19324
|
+
provider: "codex",
|
|
19325
|
+
sessionId,
|
|
19326
|
+
cwd,
|
|
19327
|
+
startedAt: startTimestamp,
|
|
19328
|
+
model,
|
|
19329
|
+
version
|
|
19330
|
+
};
|
|
19331
|
+
return {
|
|
19332
|
+
messages,
|
|
19333
|
+
source,
|
|
19334
|
+
// Codex rollout files don't include token counts (only rate limit info)
|
|
19335
|
+
tokenUsage: void 0,
|
|
19336
|
+
durationMs,
|
|
19337
|
+
costUsd: null
|
|
19338
|
+
};
|
|
19339
|
+
}
|
|
19340
|
+
function extractResponseItemContent(content) {
|
|
19341
|
+
if (typeof content === "string") return content;
|
|
19342
|
+
if (!Array.isArray(content)) return void 0;
|
|
19343
|
+
const parts = [];
|
|
19344
|
+
for (const block of content) {
|
|
19345
|
+
if (typeof block === "object" && block !== null) {
|
|
19346
|
+
const b = block;
|
|
19347
|
+
if (typeof b.text === "string") {
|
|
19348
|
+
parts.push(b.text);
|
|
19349
|
+
}
|
|
19350
|
+
}
|
|
19351
|
+
}
|
|
19352
|
+
return parts.length > 0 ? parts.join("") : void 0;
|
|
19353
|
+
}
|
|
19354
|
+
|
|
19355
|
+
// src/import/codex-session-discovery.ts
|
|
18686
19356
|
import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
|
|
18687
19357
|
import { homedir as homedir3 } from "node:os";
|
|
18688
19358
|
import path48 from "node:path";
|
|
18689
|
-
var
|
|
19359
|
+
var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");
|
|
19360
|
+
async function discoverCodexSessions(opts) {
|
|
19361
|
+
const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
|
|
19362
|
+
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
19363
|
+
const sessions = [];
|
|
19364
|
+
let yearDirs;
|
|
19365
|
+
try {
|
|
19366
|
+
yearDirs = await readdir8(sessionsDir);
|
|
19367
|
+
} catch {
|
|
19368
|
+
return [];
|
|
19369
|
+
}
|
|
19370
|
+
for (const year of yearDirs) {
|
|
19371
|
+
const yearPath = path48.join(sessionsDir, year);
|
|
19372
|
+
let monthDirs;
|
|
19373
|
+
try {
|
|
19374
|
+
monthDirs = await readdir8(yearPath);
|
|
19375
|
+
} catch {
|
|
19376
|
+
continue;
|
|
19377
|
+
}
|
|
19378
|
+
for (const month of monthDirs) {
|
|
19379
|
+
const monthPath = path48.join(yearPath, month);
|
|
19380
|
+
let dayDirs;
|
|
19381
|
+
try {
|
|
19382
|
+
dayDirs = await readdir8(monthPath);
|
|
19383
|
+
} catch {
|
|
19384
|
+
continue;
|
|
19385
|
+
}
|
|
19386
|
+
for (const day of dayDirs) {
|
|
19387
|
+
if (opts?.date) {
|
|
19388
|
+
const dirDate = `${year}-${month}-${day}`;
|
|
19389
|
+
if (dirDate !== opts.date) continue;
|
|
19390
|
+
}
|
|
19391
|
+
const dayPath = path48.join(monthPath, day);
|
|
19392
|
+
let files;
|
|
19393
|
+
try {
|
|
19394
|
+
files = await readdir8(dayPath);
|
|
19395
|
+
} catch {
|
|
19396
|
+
continue;
|
|
19397
|
+
}
|
|
19398
|
+
for (const file of files) {
|
|
19399
|
+
if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
|
|
19400
|
+
const filePath = path48.join(dayPath, file);
|
|
19401
|
+
const nameWithoutExt = file.replace(/\.jsonl$/, "");
|
|
19402
|
+
const parts = nameWithoutExt.split("-");
|
|
19403
|
+
const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
|
|
19404
|
+
let updatedAt;
|
|
19405
|
+
try {
|
|
19406
|
+
const fileStat = await stat9(filePath);
|
|
19407
|
+
updatedAt = fileStat.mtime;
|
|
19408
|
+
} catch {
|
|
19409
|
+
updatedAt = /* @__PURE__ */ new Date(0);
|
|
19410
|
+
}
|
|
19411
|
+
sessions.push({ sessionId, filePath, filename: file, updatedAt });
|
|
19412
|
+
}
|
|
19413
|
+
}
|
|
19414
|
+
}
|
|
19415
|
+
}
|
|
19416
|
+
sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
|
|
19417
|
+
return sessions.slice(0, limit);
|
|
19418
|
+
}
|
|
19419
|
+
|
|
19420
|
+
// src/import/session-discovery.ts
|
|
19421
|
+
import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
|
|
19422
|
+
import { homedir as homedir4 } from "node:os";
|
|
19423
|
+
import path49 from "node:path";
|
|
19424
|
+
var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
|
|
18690
19425
|
function encodeProjectPath(projectPath) {
|
|
18691
19426
|
return projectPath.replace(/\//g, "-");
|
|
18692
19427
|
}
|
|
@@ -18695,7 +19430,7 @@ async function discoverClaudeSessions(opts) {
|
|
|
18695
19430
|
const limit = opts?.latest ? 1 : opts?.limit ?? 10;
|
|
18696
19431
|
let projectDirs;
|
|
18697
19432
|
try {
|
|
18698
|
-
projectDirs = await
|
|
19433
|
+
projectDirs = await readdir9(projectsDir);
|
|
18699
19434
|
} catch {
|
|
18700
19435
|
return [];
|
|
18701
19436
|
}
|
|
@@ -18705,10 +19440,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
18705
19440
|
}
|
|
18706
19441
|
const sessions = [];
|
|
18707
19442
|
for (const projectDir of projectDirs) {
|
|
18708
|
-
const dirPath =
|
|
19443
|
+
const dirPath = path49.join(projectsDir, projectDir);
|
|
18709
19444
|
let entries;
|
|
18710
19445
|
try {
|
|
18711
|
-
entries = await
|
|
19446
|
+
entries = await readdir9(dirPath);
|
|
18712
19447
|
} catch {
|
|
18713
19448
|
continue;
|
|
18714
19449
|
}
|
|
@@ -18716,10 +19451,10 @@ async function discoverClaudeSessions(opts) {
|
|
|
18716
19451
|
if (!entry.endsWith(".jsonl")) continue;
|
|
18717
19452
|
const sessionId = entry.replace(/\.jsonl$/, "");
|
|
18718
19453
|
if (opts?.sessionId && sessionId !== opts.sessionId) continue;
|
|
18719
|
-
const filePath =
|
|
19454
|
+
const filePath = path49.join(dirPath, entry);
|
|
18720
19455
|
let updatedAt;
|
|
18721
19456
|
try {
|
|
18722
|
-
const fileStat = await
|
|
19457
|
+
const fileStat = await stat10(filePath);
|
|
18723
19458
|
updatedAt = fileStat.mtime;
|
|
18724
19459
|
} catch {
|
|
18725
19460
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -18738,10 +19473,85 @@ async function discoverClaudeSessions(opts) {
|
|
|
18738
19473
|
|
|
18739
19474
|
// src/import/types.ts
|
|
18740
19475
|
import { readFile as readFile14 } from "node:fs/promises";
|
|
19476
|
+
function toTranscriptJsonLine(entry) {
|
|
19477
|
+
const firstUserMessage = entry.messages.find((m) => m.role === "user");
|
|
19478
|
+
const input = typeof firstUserMessage?.content === "string" ? firstUserMessage.content : "";
|
|
19479
|
+
return {
|
|
19480
|
+
input,
|
|
19481
|
+
output: entry.messages,
|
|
19482
|
+
token_usage: entry.tokenUsage ? {
|
|
19483
|
+
input: entry.tokenUsage.input,
|
|
19484
|
+
output: entry.tokenUsage.output,
|
|
19485
|
+
cached: entry.tokenUsage.cached
|
|
19486
|
+
} : void 0,
|
|
19487
|
+
duration_ms: entry.durationMs,
|
|
19488
|
+
cost_usd: entry.costUsd,
|
|
19489
|
+
source: {
|
|
19490
|
+
provider: entry.source.provider,
|
|
19491
|
+
session_id: entry.source.sessionId,
|
|
19492
|
+
model: entry.source.model,
|
|
19493
|
+
timestamp: entry.source.startedAt,
|
|
19494
|
+
git_branch: entry.source.gitBranch,
|
|
19495
|
+
cwd: entry.source.cwd ?? entry.source.projectPath,
|
|
19496
|
+
version: entry.source.version
|
|
19497
|
+
}
|
|
19498
|
+
};
|
|
19499
|
+
}
|
|
19500
|
+
async function readTranscriptJsonl(filePath) {
|
|
19501
|
+
const text = await readFile14(filePath, "utf8");
|
|
19502
|
+
return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
19503
|
+
}
|
|
18741
19504
|
async function readTranscriptFile(filePath) {
|
|
18742
19505
|
return readFile14(filePath, "utf8");
|
|
18743
19506
|
}
|
|
18744
19507
|
|
|
19508
|
+
// src/import/transcript-provider.ts
|
|
19509
|
+
/**
 * Provider that replays pre-recorded transcript lines instead of calling a
 * live target. Every `invoke` call consumes the next line in order.
 */
var TranscriptProvider = class _TranscriptProvider {
  /** Provider identifier, formatted as `transcript:<targetName>`. */
  id;
  kind = "transcript";
  targetName;
  /** Parsed transcript records, replayed in array order. */
  lines;
  /** Index of the next record that `invoke` will return. */
  cursor = 0;
  /**
   * @param {string} targetName - Name used for `targetName` and embedded in `id`.
   * @param {Array<object>} lines - Parsed transcript records to replay.
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The target name is taken from the first record's `source.provider`, and
   * falls back to "transcript" when that record has no `source` or provider.
   *
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<_TranscriptProvider>} Provider loaded with the file's records.
   * @throws {Error} When the file contains no records.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // Optional chaining: a record without `source` should use the fallback
    // name rather than crash with an opaque TypeError.
    const providerName = lines[0]?.source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript records held by this provider. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next recorded response; the request argument is ignored.
   *
   * @param {unknown} _request - Unused; present to satisfy the provider call shape.
   * @returns {Promise<object>} Response with `output`, `tokenUsage`, `durationMs`,
   *   `costUsd` and `startTime` mapped from the record's snake_case fields.
   * @throws {Error} When every record has already been consumed.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // Normalise a JSON `null` cost to undefined.
      costUsd: line.cost_usd ?? void 0,
      // A record without `source` yields no start time instead of throwing.
      startTime: line.source?.timestamp
    };
  }
};
|
|
19554
|
+
|
|
18745
19555
|
// src/index.ts
|
|
18746
19556
|
function createAgentKernel() {
|
|
18747
19557
|
return { status: "stub" };
|
|
@@ -18755,6 +19565,7 @@ export {
|
|
|
18755
19565
|
DEFAULT_EVALUATOR_TEMPLATE,
|
|
18756
19566
|
DEFAULT_EVAL_PATTERNS,
|
|
18757
19567
|
DEFAULT_EXPLORATION_TOOLS,
|
|
19568
|
+
DEFAULT_THRESHOLD,
|
|
18758
19569
|
DeterministicAssertionEvaluator,
|
|
18759
19570
|
EvaluatorRegistry,
|
|
18760
19571
|
ExecutionMetricsEvaluator,
|
|
@@ -18776,6 +19587,7 @@ export {
|
|
|
18776
19587
|
TemplateNotFoundError,
|
|
18777
19588
|
TokenUsageEvaluator,
|
|
18778
19589
|
ToolTrajectoryEvaluator,
|
|
19590
|
+
TranscriptProvider,
|
|
18779
19591
|
WorkspaceCreationError,
|
|
18780
19592
|
WorkspacePoolManager,
|
|
18781
19593
|
addProject,
|
|
@@ -18812,6 +19624,7 @@ export {
|
|
|
18812
19624
|
detectFormat,
|
|
18813
19625
|
discoverAssertions,
|
|
18814
19626
|
discoverClaudeSessions,
|
|
19627
|
+
discoverCodexSessions,
|
|
18815
19628
|
discoverCopilotSessions,
|
|
18816
19629
|
discoverGraders,
|
|
18817
19630
|
discoverGraders as discoverJudges,
|
|
@@ -18872,6 +19685,8 @@ export {
|
|
|
18872
19685
|
normalizeLineEndings,
|
|
18873
19686
|
parseAgentSkillsEvals,
|
|
18874
19687
|
parseClaudeSession,
|
|
19688
|
+
parseCodexSession,
|
|
19689
|
+
parseCopilotEvents,
|
|
18875
19690
|
parseJsonFromText,
|
|
18876
19691
|
parseJsonSafe,
|
|
18877
19692
|
readJsonFile,
|
|
@@ -18879,8 +19694,10 @@ export {
|
|
|
18879
19694
|
readTestSuiteMetadata,
|
|
18880
19695
|
readTextFile,
|
|
18881
19696
|
readTranscriptFile,
|
|
19697
|
+
readTranscriptJsonl,
|
|
18882
19698
|
removeProject,
|
|
18883
19699
|
resolveAndCreateProvider,
|
|
19700
|
+
resolveDelegatedTargetDefinition,
|
|
18884
19701
|
resolveFileReference,
|
|
18885
19702
|
resolveTargetDefinition,
|
|
18886
19703
|
resolveWorkspaceTemplate,
|
|
@@ -18910,6 +19727,7 @@ export {
|
|
|
18910
19727
|
substituteVariables,
|
|
18911
19728
|
toCamelCaseDeep,
|
|
18912
19729
|
toSnakeCaseDeep,
|
|
19730
|
+
toTranscriptJsonLine,
|
|
18913
19731
|
tokensPerTool,
|
|
18914
19732
|
touchProject,
|
|
18915
19733
|
transpileEvalYaml,
|