@agentv/core 4.6.1 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  import {
2
2
  COMMON_TARGET_SETTINGS,
3
+ LLM_GRADER_CAPABLE_KINDS,
3
4
  TEST_MESSAGE_ROLES,
4
5
  buildDirectoryChain,
5
6
  buildSearchRoots,
@@ -24,7 +25,7 @@ import {
24
25
  resolveDelegatedTargetDefinition,
25
26
  resolveFileReference,
26
27
  resolveTargetDefinition
27
- } from "./chunk-ZK4GG7PR.js";
28
+ } from "./chunk-VCVVKCC4.js";
28
29
  import {
29
30
  AgentvProvider
30
31
  } from "./chunk-PRNXHNLF.js";
@@ -856,8 +857,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
856
857
  const negate = rawEvaluator.negate === true ? true : void 0;
857
858
  if (isCustomType) {
858
859
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
859
- const required2 = parseRequired(rawEvaluator.required);
860
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
860
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
861
+ rawEvaluator.required,
862
+ rawEvaluator.min_score,
863
+ name,
864
+ evalId
865
+ );
866
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
861
867
  const config2 = {};
862
868
  for (const [key, value] of Object.entries(rawEvaluator)) {
863
869
  if (!knownProps2.has(key) && value !== void 0) {
@@ -869,6 +875,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
869
875
  type: customTypeName,
870
876
  ...weight2 !== void 0 ? { weight: weight2 } : {},
871
877
  ...required2 !== void 0 ? { required: required2 } : {},
878
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
872
879
  ...negate !== void 0 ? { negate } : {},
873
880
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
874
881
  });
@@ -938,7 +945,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
938
945
  );
939
946
  }
940
947
  }
941
- const required2 = parseRequired(rawEvaluator.required);
948
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
949
+ rawEvaluator.required,
950
+ rawEvaluator.min_score,
951
+ name,
952
+ evalId
953
+ );
942
954
  const knownProps2 = /* @__PURE__ */ new Set([
943
955
  "name",
944
956
  "type",
@@ -964,6 +976,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
964
976
  resolvedCwd,
965
977
  ...weight2 !== void 0 ? { weight: weight2 } : {},
966
978
  ...required2 !== void 0 ? { required: required2 } : {},
979
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
967
980
  ...negate !== void 0 ? { negate } : {},
968
981
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
969
982
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -1092,7 +1105,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1092
1105
  };
1093
1106
  }
1094
1107
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1095
- const required2 = parseRequired(rawEvaluator.required);
1108
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1109
+ rawEvaluator.required,
1110
+ rawEvaluator.min_score,
1111
+ name,
1112
+ evalId
1113
+ );
1096
1114
  evaluators.push({
1097
1115
  name,
1098
1116
  type: "composite",
@@ -1100,6 +1118,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1100
1118
  aggregator,
1101
1119
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1102
1120
  ...required2 !== void 0 ? { required: required2 } : {},
1121
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1103
1122
  ...negate !== void 0 ? { negate } : {}
1104
1123
  });
1105
1124
  continue;
@@ -1210,7 +1229,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1210
1229
  continue;
1211
1230
  }
1212
1231
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1213
- const required2 = parseRequired(rawEvaluator.required);
1232
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1233
+ rawEvaluator.required,
1234
+ rawEvaluator.min_score,
1235
+ name,
1236
+ evalId
1237
+ );
1214
1238
  const config2 = {
1215
1239
  name,
1216
1240
  type: "tool-trajectory",
@@ -1219,6 +1243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1219
1243
  ...expected ? { expected } : {},
1220
1244
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1221
1245
  ...required2 !== void 0 ? { required: required2 } : {},
1246
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1222
1247
  ...negate !== void 0 ? { negate } : {},
1223
1248
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
1224
1249
  };
@@ -1281,7 +1306,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1281
1306
  const aggregation = asString(rawEvaluator.aggregation);
1282
1307
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
1283
1308
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1284
- const required2 = parseRequired(rawEvaluator.required);
1309
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1310
+ rawEvaluator.required,
1311
+ rawEvaluator.min_score,
1312
+ name,
1313
+ evalId
1314
+ );
1285
1315
  evaluators.push({
1286
1316
  name,
1287
1317
  type: "field-accuracy",
@@ -1289,6 +1319,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1289
1319
  ...validAggregation ? { aggregation: validAggregation } : {},
1290
1320
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1291
1321
  ...required2 !== void 0 ? { required: required2 } : {},
1322
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1292
1323
  ...negate !== void 0 ? { negate } : {}
1293
1324
  });
1294
1325
  continue;
@@ -1302,13 +1333,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1302
1333
  continue;
1303
1334
  }
1304
1335
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1305
- const required2 = parseRequired(rawEvaluator.required);
1336
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1337
+ rawEvaluator.required,
1338
+ rawEvaluator.min_score,
1339
+ name,
1340
+ evalId
1341
+ );
1306
1342
  evaluators.push({
1307
1343
  name,
1308
1344
  type: "latency",
1309
1345
  threshold,
1310
1346
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1311
1347
  ...required2 !== void 0 ? { required: required2 } : {},
1348
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1312
1349
  ...negate !== void 0 ? { negate } : {}
1313
1350
  });
1314
1351
  continue;
@@ -1322,13 +1359,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1322
1359
  continue;
1323
1360
  }
1324
1361
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1325
- const required2 = parseRequired(rawEvaluator.required);
1362
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1363
+ rawEvaluator.required,
1364
+ rawEvaluator.min_score,
1365
+ name,
1366
+ evalId
1367
+ );
1326
1368
  evaluators.push({
1327
1369
  name,
1328
1370
  type: "cost",
1329
1371
  budget,
1330
1372
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1331
1373
  ...required2 !== void 0 ? { required: required2 } : {},
1374
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1332
1375
  ...negate !== void 0 ? { negate } : {}
1333
1376
  });
1334
1377
  continue;
@@ -1360,13 +1403,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1360
1403
  continue;
1361
1404
  }
1362
1405
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1363
- const required2 = parseRequired(rawEvaluator.required);
1406
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1407
+ rawEvaluator.required,
1408
+ rawEvaluator.min_score,
1409
+ name,
1410
+ evalId
1411
+ );
1364
1412
  evaluators.push({
1365
1413
  name,
1366
1414
  type: "token-usage",
1367
1415
  ...validLimits,
1368
1416
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1369
1417
  ...required2 !== void 0 ? { required: required2 } : {},
1418
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1370
1419
  ...negate !== void 0 ? { negate } : {}
1371
1420
  });
1372
1421
  continue;
@@ -1412,13 +1461,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1412
1461
  continue;
1413
1462
  }
1414
1463
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1415
- const required2 = parseRequired(rawEvaluator.required);
1464
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1465
+ rawEvaluator.required,
1466
+ rawEvaluator.min_score,
1467
+ name,
1468
+ evalId
1469
+ );
1416
1470
  evaluators.push({
1417
1471
  name,
1418
1472
  type: "execution-metrics",
1419
1473
  ...validThresholds,
1420
1474
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1421
1475
  ...required2 !== void 0 ? { required: required2 } : {},
1476
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1422
1477
  ...negate !== void 0 ? { negate } : {}
1423
1478
  });
1424
1479
  continue;
@@ -1432,7 +1487,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1432
1487
  const rawShouldTrigger = rawEvaluator.should_trigger;
1433
1488
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
1434
1489
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1435
- const required2 = parseRequired(rawEvaluator.required);
1490
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1491
+ rawEvaluator.required,
1492
+ rawEvaluator.min_score,
1493
+ name,
1494
+ evalId
1495
+ );
1436
1496
  evaluators.push({
1437
1497
  name,
1438
1498
  type: "skill-trigger",
@@ -1440,6 +1500,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1440
1500
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
1441
1501
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1442
1502
  ...required2 !== void 0 ? { required: required2 } : {},
1503
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1443
1504
  ...negate !== void 0 ? { negate } : {}
1444
1505
  });
1445
1506
  continue;
@@ -1451,13 +1512,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1451
1512
  continue;
1452
1513
  }
1453
1514
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1454
- const required2 = parseRequired(rawEvaluator.required);
1515
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1516
+ rawEvaluator.required,
1517
+ rawEvaluator.min_score,
1518
+ name,
1519
+ evalId
1520
+ );
1455
1521
  evaluators.push({
1456
1522
  name,
1457
1523
  type: "contains",
1458
1524
  value,
1459
1525
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1460
1526
  ...required2 !== void 0 ? { required: required2 } : {},
1527
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1461
1528
  ...negate !== void 0 ? { negate } : {}
1462
1529
  });
1463
1530
  continue;
@@ -1471,13 +1538,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1471
1538
  continue;
1472
1539
  }
1473
1540
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1474
- const required2 = parseRequired(rawEvaluator.required);
1541
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1542
+ rawEvaluator.required,
1543
+ rawEvaluator.min_score,
1544
+ name,
1545
+ evalId
1546
+ );
1475
1547
  evaluators.push({
1476
1548
  name,
1477
1549
  type: typeValue,
1478
1550
  value,
1479
1551
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1480
1552
  ...required2 !== void 0 ? { required: required2 } : {},
1553
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1481
1554
  ...negate !== void 0 ? { negate } : {}
1482
1555
  });
1483
1556
  continue;
@@ -1489,13 +1562,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1489
1562
  continue;
1490
1563
  }
1491
1564
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1492
- const required2 = parseRequired(rawEvaluator.required);
1565
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1566
+ rawEvaluator.required,
1567
+ rawEvaluator.min_score,
1568
+ name,
1569
+ evalId
1570
+ );
1493
1571
  evaluators.push({
1494
1572
  name,
1495
1573
  type: "icontains",
1496
1574
  value,
1497
1575
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1498
1576
  ...required2 !== void 0 ? { required: required2 } : {},
1577
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1499
1578
  ...negate !== void 0 ? { negate } : {}
1500
1579
  });
1501
1580
  continue;
@@ -1509,13 +1588,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1509
1588
  continue;
1510
1589
  }
1511
1590
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1512
- const required2 = parseRequired(rawEvaluator.required);
1591
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1592
+ rawEvaluator.required,
1593
+ rawEvaluator.min_score,
1594
+ name,
1595
+ evalId
1596
+ );
1513
1597
  evaluators.push({
1514
1598
  name,
1515
1599
  type: typeValue,
1516
1600
  value,
1517
1601
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1518
1602
  ...required2 !== void 0 ? { required: required2 } : {},
1603
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1519
1604
  ...negate !== void 0 ? { negate } : {}
1520
1605
  });
1521
1606
  continue;
@@ -1527,13 +1612,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1527
1612
  continue;
1528
1613
  }
1529
1614
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1530
- const required2 = parseRequired(rawEvaluator.required);
1615
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1616
+ rawEvaluator.required,
1617
+ rawEvaluator.min_score,
1618
+ name,
1619
+ evalId
1620
+ );
1531
1621
  evaluators.push({
1532
1622
  name,
1533
1623
  type: typeValue,
1534
1624
  value,
1535
1625
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1536
1626
  ...required2 !== void 0 ? { required: required2 } : {},
1627
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1537
1628
  ...negate !== void 0 ? { negate } : {}
1538
1629
  });
1539
1630
  continue;
@@ -1546,7 +1637,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1546
1637
  }
1547
1638
  const flags = asString(rawEvaluator.flags);
1548
1639
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1549
- const required2 = parseRequired(rawEvaluator.required);
1640
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1641
+ rawEvaluator.required,
1642
+ rawEvaluator.min_score,
1643
+ name,
1644
+ evalId
1645
+ );
1550
1646
  evaluators.push({
1551
1647
  name,
1552
1648
  type: "regex",
@@ -1554,18 +1650,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1554
1650
  ...flags !== void 0 ? { flags } : {},
1555
1651
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1556
1652
  ...required2 !== void 0 ? { required: required2 } : {},
1653
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1557
1654
  ...negate !== void 0 ? { negate } : {}
1558
1655
  });
1559
1656
  continue;
1560
1657
  }
1561
1658
  if (typeValue === "is-json") {
1562
1659
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1563
- const required2 = parseRequired(rawEvaluator.required);
1660
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1661
+ rawEvaluator.required,
1662
+ rawEvaluator.min_score,
1663
+ name,
1664
+ evalId
1665
+ );
1564
1666
  evaluators.push({
1565
1667
  name,
1566
1668
  type: "is-json",
1567
1669
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1568
1670
  ...required2 !== void 0 ? { required: required2 } : {},
1671
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1569
1672
  ...negate !== void 0 ? { negate } : {}
1570
1673
  });
1571
1674
  continue;
@@ -1577,13 +1680,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1577
1680
  continue;
1578
1681
  }
1579
1682
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1580
- const required2 = parseRequired(rawEvaluator.required);
1683
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1684
+ rawEvaluator.required,
1685
+ rawEvaluator.min_score,
1686
+ name,
1687
+ evalId
1688
+ );
1581
1689
  evaluators.push({
1582
1690
  name,
1583
1691
  type: "equals",
1584
1692
  value,
1585
1693
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1586
1694
  ...required2 !== void 0 ? { required: required2 } : {},
1695
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1587
1696
  ...negate !== void 0 ? { negate } : {}
1588
1697
  });
1589
1698
  continue;
@@ -1619,7 +1728,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1619
1728
  continue;
1620
1729
  }
1621
1730
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1622
- const required2 = parseRequired(rawEvaluator.required);
1731
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1732
+ rawEvaluator.required,
1733
+ rawEvaluator.min_score,
1734
+ name,
1735
+ evalId
1736
+ );
1623
1737
  evaluators.push({
1624
1738
  name,
1625
1739
  type: "llm-grader",
@@ -1627,6 +1741,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1627
1741
  ...graderTargetName ? { target: graderTargetName } : {},
1628
1742
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1629
1743
  ...required2 !== void 0 ? { required: required2 } : {},
1744
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1630
1745
  ...negate !== void 0 ? { negate } : {}
1631
1746
  });
1632
1747
  continue;
@@ -1696,7 +1811,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1696
1811
  continue;
1697
1812
  }
1698
1813
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1699
- const required2 = parseRequired(rawEvaluator.required);
1814
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1815
+ rawEvaluator.required,
1816
+ rawEvaluator.min_score,
1817
+ name,
1818
+ evalId
1819
+ );
1700
1820
  evaluators.push({
1701
1821
  name,
1702
1822
  type: "llm-grader",
@@ -1704,12 +1824,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1704
1824
  ...graderTargetName ? { target: graderTargetName } : {},
1705
1825
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1706
1826
  ...required2 !== void 0 ? { required: required2 } : {},
1827
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1707
1828
  ...negate !== void 0 ? { negate } : {}
1708
1829
  });
1709
1830
  continue;
1710
1831
  }
1711
1832
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
1712
- const required = parseRequired(rawEvaluator.required);
1833
+ const { required, min_score } = parseRequiredAndMinScore(
1834
+ rawEvaluator.required,
1835
+ rawEvaluator.min_score,
1836
+ name,
1837
+ evalId
1838
+ );
1713
1839
  const knownProps = /* @__PURE__ */ new Set([
1714
1840
  "name",
1715
1841
  "type",
@@ -1720,6 +1846,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1720
1846
  "weight",
1721
1847
  "config",
1722
1848
  "required",
1849
+ "min_score",
1723
1850
  "negate",
1724
1851
  "max_steps",
1725
1852
  "maxSteps",
@@ -1749,6 +1876,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1749
1876
  ...graderTargetName ? { target: graderTargetName } : {},
1750
1877
  ...weight !== void 0 ? { weight } : {},
1751
1878
  ...required !== void 0 ? { required } : {},
1879
+ ...min_score !== void 0 ? { min_score } : {},
1752
1880
  ...negate !== void 0 ? { negate } : {},
1753
1881
  ...finalConfig ? { config: finalConfig } : {},
1754
1882
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -1880,10 +2008,23 @@ ${detailBlock}${ANSI_RESET4}`);
1880
2008
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
1881
2009
  }
1882
2010
  }
1883
- function parseRequired(value) {
1884
- if (value === true) return true;
1885
- if (typeof value === "number" && value > 0 && value <= 1) return value;
1886
- return void 0;
2011
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
2012
+ const result = {};
2013
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
2014
+ result.min_score = rawMinScore;
2015
+ }
2016
+ if (rawRequired === true) {
2017
+ result.required = true;
2018
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
2019
+ if (result.min_score === void 0) {
2020
+ result.min_score = rawRequired;
2021
+ }
2022
+ result.required = rawRequired;
2023
+ logWarning2(
2024
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
2025
+ );
2026
+ }
2027
+ return result;
1887
2028
  }
1888
2029
  function validateWeight(rawWeight, evaluatorName, evalId) {
1889
2030
  if (rawWeight === void 0) {
@@ -1926,16 +2067,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1926
2067
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
1927
2068
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
1928
2069
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
2070
+ let minScore;
1929
2071
  let requiredMinScore;
1930
2072
  let required;
1931
- if (typeof rawRubric.required_min_score === "number") {
1932
- const minScore = rawRubric.required_min_score;
1933
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
2073
+ if (typeof rawRubric.min_score === "number") {
2074
+ const ms = rawRubric.min_score;
2075
+ if (ms <= 0 || ms > 1) {
1934
2076
  throw new Error(
1935
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
2077
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
1936
2078
  );
1937
2079
  }
1938
- requiredMinScore = minScore;
2080
+ minScore = ms;
2081
+ requiredMinScore = Math.round(ms * 10);
2082
+ } else if (typeof rawRubric.required_min_score === "number") {
2083
+ const rms = rawRubric.required_min_score;
2084
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
2085
+ throw new Error(
2086
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
2087
+ );
2088
+ }
2089
+ requiredMinScore = rms;
2090
+ minScore = rms / 10;
2091
+ logWarning2(
2092
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
2093
+ );
1939
2094
  }
1940
2095
  if (typeof rawRubric.required === "boolean") {
1941
2096
  required = rawRubric.required;
@@ -1955,6 +2110,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1955
2110
  weight,
1956
2111
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
1957
2112
  ...required !== void 0 ? { required } : {},
2113
+ ...minScore !== void 0 ? { min_score: minScore } : {},
1958
2114
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
1959
2115
  score_ranges: scoreRanges
1960
2116
  });
@@ -1971,6 +2127,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1971
2127
  weight,
1972
2128
  // Default to required: true if not specified (backward compatibility)
1973
2129
  required: required ?? true,
2130
+ ...minScore !== void 0 ? { min_score: minScore } : {},
1974
2131
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
1975
2132
  });
1976
2133
  }
@@ -2099,12 +2256,22 @@ function parseInlineRubrics(rawRubrics) {
2099
2256
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
2100
2257
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
2101
2258
  };
2259
+ let inlineMinScore;
2260
+ let inlineRequiredMinScore;
2261
+ if (typeof rubric.min_score === "number") {
2262
+ inlineMinScore = rubric.min_score;
2263
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
2264
+ } else if (typeof rubric.required_min_score === "number") {
2265
+ inlineRequiredMinScore = rubric.required_min_score;
2266
+ inlineMinScore = inlineRequiredMinScore / 10;
2267
+ }
2102
2268
  if (scoreRanges && scoreRanges.length > 0) {
2103
2269
  return {
2104
2270
  ...baseRubric,
2105
2271
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
2106
2272
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
2107
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
2273
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
2274
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
2108
2275
  score_ranges: scoreRanges
2109
2276
  };
2110
2277
  }
@@ -2112,7 +2279,8 @@ function parseInlineRubrics(rawRubrics) {
2112
2279
  ...baseRubric,
2113
2280
  outcome: expectedOutcome,
2114
2281
  required: typeof rubric.required === "boolean" ? rubric.required : true,
2115
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
2282
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
2283
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
2116
2284
  };
2117
2285
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
2118
2286
  if (rubricItems.length === 0) {
@@ -2512,6 +2680,9 @@ function resolveExpectedMessages(raw) {
2512
2680
  var ANSI_YELLOW5 = "\x1B[33m";
2513
2681
  var ANSI_RED2 = "\x1B[31m";
2514
2682
  var ANSI_RESET6 = "\x1B[0m";
2683
+ function matchesFilter(id, filter) {
2684
+ return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
2685
+ }
2515
2686
  function detectFormat(filePath) {
2516
2687
  const ext = path6.extname(filePath).toLowerCase();
2517
2688
  if (ext === ".jsonl") return "jsonl";
@@ -2579,40 +2750,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2579
2750
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
2580
2751
  const rawFile = await readFile5(absoluteTestPath, "utf8");
2581
2752
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
2582
- const fallbackEvalSet = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2583
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
2753
+ const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2754
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
2584
2755
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
2585
2756
  const globalExecution = sidecar.execution;
2586
2757
  if (verbose) {
2587
2758
  console.log(`
2588
- [JSONL Dataset: ${evalFilePath}]`);
2759
+ [JSONL Suite: ${evalFilePath}]`);
2589
2760
  console.log(` Cases: ${rawCases.length}`);
2590
- console.log(` Eval set: ${evalSetName}`);
2761
+ console.log(` Suite: ${suiteName}`);
2591
2762
  if (sidecar.description) {
2592
2763
  console.log(` Description: ${sidecar.description}`);
2593
2764
  }
2594
2765
  }
2595
2766
  const results = [];
2596
2767
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
2597
- const evalcase = rawCases[lineIndex];
2768
+ const testCaseConfig = rawCases[lineIndex];
2598
2769
  const lineNumber = lineIndex + 1;
2599
- const id = asString4(evalcase.id);
2600
- if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
2770
+ const id = asString4(testCaseConfig.id);
2771
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
2601
2772
  continue;
2602
2773
  }
2603
- const conversationId = asString4(evalcase.conversation_id);
2604
- let outcome = asString4(evalcase.criteria);
2605
- if (!outcome && evalcase.expected_outcome !== void 0) {
2606
- outcome = asString4(evalcase.expected_outcome);
2774
+ const conversationId = asString4(testCaseConfig.conversation_id);
2775
+ let outcome = asString4(testCaseConfig.criteria);
2776
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
2777
+ outcome = asString4(testCaseConfig.expected_outcome);
2607
2778
  if (outcome) {
2608
2779
  logWarning4(
2609
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2780
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2610
2781
  );
2611
2782
  }
2612
2783
  }
2613
- const rawInputMessages = resolveInputMessages(evalcase);
2614
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2615
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
2784
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
2785
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
2786
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
2616
2787
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
2617
2788
  logError2(
2618
2789
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -2649,18 +2820,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2649
2820
  }
2650
2821
  }
2651
2822
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
2652
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
2823
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
2653
2824
  const mergedExecution = caseExecution ?? globalExecution;
2654
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
2825
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
2655
2826
  let evaluators;
2656
2827
  try {
2657
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
2828
+ evaluators = await parseEvaluators(
2829
+ testCaseConfig,
2830
+ mergedExecution,
2831
+ searchRoots,
2832
+ id ?? "unknown"
2833
+ );
2658
2834
  } catch (error) {
2659
2835
  const message = error instanceof Error ? error.message : String(error);
2660
2836
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
2661
2837
  continue;
2662
2838
  }
2663
- const inlineRubrics = evalcase.rubrics;
2839
+ const inlineRubrics = testCaseConfig.rubrics;
2664
2840
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
2665
2841
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
2666
2842
  if (rubricEvaluator) {
@@ -2671,7 +2847,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2671
2847
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
2672
2848
  const testCase = {
2673
2849
  id,
2674
- dataset: evalSetName,
2850
+ suite: suiteName,
2675
2851
  conversation_id: conversationId,
2676
2852
  question,
2677
2853
  input: inputMessages,
@@ -2679,7 +2855,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2679
2855
  reference_answer: referenceAnswer,
2680
2856
  file_paths: userFilePaths,
2681
2857
  criteria: outcome ?? "",
2682
- evaluator: evalCaseEvaluatorKind,
2858
+ evaluator: testCaseEvaluatorKind,
2683
2859
  assertions: evaluators
2684
2860
  };
2685
2861
  results.push(testCase);
@@ -2862,6 +3038,9 @@ function buildChatPromptFromSegments(options) {
2862
3038
  var ANSI_YELLOW6 = "\x1B[33m";
2863
3039
  var ANSI_RED3 = "\x1B[31m";
2864
3040
  var ANSI_RESET7 = "\x1B[0m";
3041
+ function matchesFilter2(id, filter) {
3042
+ return typeof filter === "string" ? micromatch2.isMatch(id, filter) : filter.some((pattern) => micromatch2.isMatch(id, pattern));
3043
+ }
2865
3044
  function resolveTests(suite) {
2866
3045
  if (suite.tests !== void 0) return suite.tests;
2867
3046
  if (suite.eval_cases !== void 0) {
@@ -2941,18 +3120,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2941
3120
  throw new Error(`Invalid test file format: ${evalFilePath}`);
2942
3121
  }
2943
3122
  const suite = interpolated;
2944
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
2945
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
2946
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
2947
- const rawTestcases = resolveTests(suite);
3123
+ const suiteNameFromFile = asString5(suite.name)?.trim();
3124
+ const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
3125
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
3126
+ const rawTestCases = resolveTests(suite);
2948
3127
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
2949
3128
  const evalFileDir = path7.dirname(absoluteTestPath);
2950
- let expandedTestcases;
2951
- if (typeof rawTestcases === "string") {
2952
- const externalPath = path7.resolve(evalFileDir, rawTestcases);
2953
- expandedTestcases = await loadCasesFromFile(externalPath);
2954
- } else if (Array.isArray(rawTestcases)) {
2955
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
3129
+ let expandedTestCases;
3130
+ if (typeof rawTestCases === "string") {
3131
+ const externalPath = path7.resolve(evalFileDir, rawTestCases);
3132
+ expandedTestCases = await loadCasesFromFile(externalPath);
3133
+ } else if (Array.isArray(rawTestCases)) {
3134
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
2956
3135
  } else {
2957
3136
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
2958
3137
  }
@@ -2967,32 +3146,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2967
3146
  }
2968
3147
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
2969
3148
  const results = [];
2970
- for (const rawEvalcase of expandedTestcases) {
2971
- if (!isJsonObject(rawEvalcase)) {
3149
+ for (const rawTestCase of expandedTestCases) {
3150
+ if (!isJsonObject(rawTestCase)) {
2972
3151
  logWarning5("Skipping invalid test entry (expected object)");
2973
3152
  continue;
2974
3153
  }
2975
- const evalcase = rawEvalcase;
2976
- const id = asString5(evalcase.id);
2977
- if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
3154
+ const testCaseConfig = rawTestCase;
3155
+ const id = asString5(testCaseConfig.id);
3156
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
2978
3157
  continue;
2979
3158
  }
2980
- const conversationId = asString5(evalcase.conversation_id);
2981
- let outcome = asString5(evalcase.criteria);
2982
- if (!outcome && evalcase.expected_outcome !== void 0) {
2983
- outcome = asString5(evalcase.expected_outcome);
3159
+ const conversationId = asString5(testCaseConfig.conversation_id);
3160
+ let outcome = asString5(testCaseConfig.criteria);
3161
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
3162
+ outcome = asString5(testCaseConfig.expected_outcome);
2984
3163
  if (outcome) {
2985
3164
  logWarning5(
2986
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
3165
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2987
3166
  );
2988
3167
  }
2989
3168
  }
2990
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
3169
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
2991
3170
  const skipDefaults = caseExecution?.skip_defaults === true;
3171
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
2992
3172
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
2993
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
2994
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2995
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
3173
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
3174
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
3175
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
2996
3176
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
2997
3177
  logError3(
2998
3178
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -3039,16 +3219,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3039
3219
  }
3040
3220
  }
3041
3221
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
3042
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
3222
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
3043
3223
  let evaluators;
3044
3224
  try {
3045
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
3225
+ evaluators = await parseEvaluators(
3226
+ testCaseConfig,
3227
+ globalExecution,
3228
+ searchRoots,
3229
+ id ?? "unknown"
3230
+ );
3046
3231
  } catch (error) {
3047
3232
  const message = error instanceof Error ? error.message : String(error);
3048
3233
  logError3(`Skipping test '${id}': ${message}`);
3049
3234
  continue;
3050
3235
  }
3051
- const inlineRubrics = evalcase.rubrics;
3236
+ const inlineRubrics = testCaseConfig.rubrics;
3052
3237
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
3053
3238
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
3054
3239
  if (rubricEvaluator) {
@@ -3057,13 +3242,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3057
3242
  }
3058
3243
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
3059
3244
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
3060
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
3245
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
3061
3246
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
3062
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
3063
- const caseTargets = extractTargetsFromTestCase(evalcase);
3247
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
3248
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
3064
3249
  const testCase = {
3065
3250
  id,
3066
- dataset: evalSetName,
3251
+ suite: suiteName,
3067
3252
  category: options?.category,
3068
3253
  conversation_id: conversationId,
3069
3254
  question,
@@ -3072,11 +3257,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3072
3257
  reference_answer: referenceAnswer,
3073
3258
  file_paths: userFilePaths,
3074
3259
  criteria: outcome ?? "",
3075
- evaluator: evalCaseEvaluatorKind,
3260
+ evaluator: testCaseEvaluatorKind,
3076
3261
  assertions: evaluators,
3077
3262
  workspace: mergedWorkspace,
3078
3263
  metadata,
3079
- targets: caseTargets
3264
+ targets: caseTargets,
3265
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
3080
3266
  };
3081
3267
  results.push(testCase);
3082
3268
  }
@@ -3620,7 +3806,7 @@ var AzureProvider = class {
3620
3806
  };
3621
3807
  this.retryConfig = config.retry;
3622
3808
  const azure = createAzure(buildAzureOptions(config));
3623
- this.model = azure.chat(config.deploymentName);
3809
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
3624
3810
  }
3625
3811
  id;
3626
3812
  kind = "azure";
@@ -3746,7 +3932,9 @@ function buildAzureOptions(config) {
3746
3932
  const options = {
3747
3933
  apiKey: config.apiKey,
3748
3934
  apiVersion: config.version,
3749
- useDeploymentBasedUrls: true
3935
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
3936
+ // with existing deployments. Responses API should use the SDK's v1 path.
3937
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
3750
3938
  };
3751
3939
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
3752
3940
  if (baseURL) {
@@ -6946,6 +7134,25 @@ var CopilotSdkProvider = class {
6946
7134
  content: systemPrompt
6947
7135
  };
6948
7136
  }
7137
+ if (this.config.byokBaseUrl) {
7138
+ const byokType = this.config.byokType ?? "openai";
7139
+ const provider = {
7140
+ type: byokType,
7141
+ baseUrl: normalizeByokBaseUrl(this.config.byokBaseUrl, byokType)
7142
+ };
7143
+ if (this.config.byokBearerToken) {
7144
+ provider.bearerToken = this.config.byokBearerToken;
7145
+ } else if (this.config.byokApiKey) {
7146
+ provider.apiKey = this.config.byokApiKey;
7147
+ }
7148
+ if (this.config.byokWireApi) {
7149
+ provider.wireApi = this.config.byokWireApi;
7150
+ }
7151
+ if (this.config.byokType === "azure" && this.config.byokApiVersion) {
7152
+ provider.azure = { apiVersion: this.config.byokApiVersion };
7153
+ }
7154
+ sessionOptions.provider = provider;
7155
+ }
6949
7156
  let session;
6950
7157
  try {
6951
7158
  session = await client.createSession(sessionOptions);
@@ -7177,6 +7384,16 @@ function resolveSkillDirectories(cwd) {
7177
7384
  ];
7178
7385
  return candidates.filter((dir) => existsSync2(dir));
7179
7386
  }
7387
+ function normalizeByokBaseUrl(baseUrl, type) {
7388
+ const trimmed = baseUrl.trim().replace(/\/+$/, "");
7389
+ if (/^https?:\/\//i.test(trimmed)) {
7390
+ return trimmed;
7391
+ }
7392
+ if (type === "azure") {
7393
+ return `https://${trimmed}.openai.azure.com`;
7394
+ }
7395
+ return trimmed;
7396
+ }
7180
7397
  function summarizeSdkEvent(eventType, data) {
7181
7398
  if (!data || typeof data !== "object") {
7182
7399
  return eventType;
@@ -7354,6 +7571,22 @@ function extractAzureResourceName(baseUrl) {
7354
7571
  if (urlMatch) return urlMatch[1];
7355
7572
  return baseUrl;
7356
7573
  }
7574
+ function normalizeAzureSdkBaseUrl(baseUrl) {
7575
+ const trimmed = baseUrl.trim().replace(/\/+$/, "");
7576
+ if (!trimmed) {
7577
+ return trimmed;
7578
+ }
7579
+ if (!/^https?:\/\//i.test(trimmed)) {
7580
+ return `https://${trimmed}.openai.azure.com/openai/v1`;
7581
+ }
7582
+ if (/\/openai\/v1$/i.test(trimmed)) {
7583
+ return trimmed;
7584
+ }
7585
+ if (/\/openai$/i.test(trimmed)) {
7586
+ return `${trimmed}/v1`;
7587
+ }
7588
+ return `${trimmed}/openai/v1`;
7589
+ }
7357
7590
 
7358
7591
  // src/evaluation/providers/pi-utils.ts
7359
7592
  function extractPiTextContent(content) {
@@ -8184,11 +8417,41 @@ async function defaultPiRunner(options) {
8184
8417
  // src/evaluation/providers/pi-coding-agent.ts
8185
8418
  import { execSync as execSync2 } from "node:child_process";
8186
8419
  import { randomUUID as randomUUID8 } from "node:crypto";
8187
- import { accessSync as accessSync2, createWriteStream as createWriteStream6 } from "node:fs";
8420
+ import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
8188
8421
  import { mkdir as mkdir7 } from "node:fs/promises";
8189
- import path20 from "node:path";
8422
+ import path21 from "node:path";
8190
8423
  import { createInterface } from "node:readline";
8191
- import { fileURLToPath as fileURLToPath3 } from "node:url";
8424
+ import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
8425
+
8426
+ // src/paths.ts
8427
+ import os2 from "node:os";
8428
+ import path20 from "node:path";
8429
+ var logged = false;
8430
+ function getAgentvHome() {
8431
+ const envHome = process.env.AGENTV_HOME;
8432
+ if (envHome && envHome !== "undefined") {
8433
+ if (!logged) {
8434
+ logged = true;
8435
+ console.warn(`Using AGENTV_HOME: ${envHome}`);
8436
+ }
8437
+ return envHome;
8438
+ }
8439
+ return path20.join(os2.homedir(), ".agentv");
8440
+ }
8441
+ function getWorkspacesRoot() {
8442
+ return path20.join(getAgentvHome(), "workspaces");
8443
+ }
8444
+ function getSubagentsRoot() {
8445
+ return path20.join(getAgentvHome(), "subagents");
8446
+ }
8447
+ function getTraceStateRoot() {
8448
+ return path20.join(getAgentvHome(), "trace-state");
8449
+ }
8450
+ function getWorkspacePoolRoot() {
8451
+ return path20.join(getAgentvHome(), "workspace-pool");
8452
+ }
8453
+
8454
+ // src/evaluation/providers/pi-coding-agent.ts
8192
8455
  var piCodingAgentModule = null;
8193
8456
  var piAiModule = null;
8194
8457
  var loadingPromise = null;
@@ -8206,46 +8469,126 @@ async function promptInstall() {
8206
8469
  rl.close();
8207
8470
  }
8208
8471
  }
8209
- function findAgentvRoot() {
8210
- const thisFile = fileURLToPath3(import.meta.url);
8211
- let dir = path20.dirname(thisFile);
8212
- for (let i = 0; i < 10; i++) {
8472
+ function findManagedSdkInstallRoot() {
8473
+ return path21.join(getAgentvHome(), "deps", "pi-sdk");
8474
+ }
8475
+ function resolveGlobalNpmRoot() {
8476
+ try {
8477
+ const root = execSync2("npm root -g", {
8478
+ encoding: "utf-8",
8479
+ stdio: ["ignore", "pipe", "ignore"]
8480
+ }).trim();
8481
+ return root.length > 0 ? root : void 0;
8482
+ } catch {
8483
+ return void 0;
8484
+ }
8485
+ }
8486
+ function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
8487
+ return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
8488
+ }
8489
+ function findAccessiblePath(paths) {
8490
+ for (const candidate of paths) {
8213
8491
  try {
8214
- const pkg = path20.join(dir, "package.json");
8215
- accessSync2(pkg);
8216
- return dir;
8492
+ accessSync2(candidate);
8493
+ return candidate;
8217
8494
  } catch {
8218
- const parent = path20.dirname(dir);
8219
- if (parent === dir) break;
8220
- dir = parent;
8221
8495
  }
8222
8496
  }
8223
- return path20.dirname(thisFile);
8497
+ return void 0;
8224
8498
  }
8225
- async function doLoadSdkModules() {
8499
+ async function tryImportLocalSdkModules() {
8226
8500
  try {
8227
8501
  [piCodingAgentModule, piAiModule] = await Promise.all([
8228
8502
  import("@mariozechner/pi-coding-agent"),
8229
8503
  import("@mariozechner/pi-ai")
8230
8504
  ]);
8505
+ return true;
8231
8506
  } catch {
8232
- if (await promptInstall()) {
8233
- const installDir = findAgentvRoot();
8234
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
8235
- execSync2("bun add @mariozechner/pi-coding-agent", {
8236
- cwd: installDir,
8237
- stdio: "inherit"
8238
- });
8239
- [piCodingAgentModule, piAiModule] = await Promise.all([
8240
- import("@mariozechner/pi-coding-agent"),
8241
- import("@mariozechner/pi-ai")
8242
- ]);
8243
- } else {
8244
- throw new Error(
8245
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
8246
- );
8507
+ return false;
8508
+ }
8509
+ }
8510
+ async function tryImportManagedSdkModules() {
8511
+ const managedRoot = findManagedSdkInstallRoot();
8512
+ const piCodingAgentEntry = findAccessiblePath([
8513
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
8514
+ ]);
8515
+ const piAiEntry = findAccessiblePath([
8516
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
8517
+ path21.join(
8518
+ managedRoot,
8519
+ "node_modules",
8520
+ "@mariozechner",
8521
+ "pi-coding-agent",
8522
+ "node_modules",
8523
+ "@mariozechner",
8524
+ "pi-ai",
8525
+ "dist",
8526
+ "index.js"
8527
+ )
8528
+ ]);
8529
+ if (!piCodingAgentEntry || !piAiEntry) return false;
8530
+ try {
8531
+ [piCodingAgentModule, piAiModule] = await Promise.all([
8532
+ import(pathToFileURL(piCodingAgentEntry).href),
8533
+ import(pathToFileURL(piAiEntry).href)
8534
+ ]);
8535
+ return true;
8536
+ } catch {
8537
+ return false;
8538
+ }
8539
+ }
8540
+ async function tryImportGlobalSdkModules() {
8541
+ const globalNpmRoot = resolveGlobalNpmRoot();
8542
+ if (!globalNpmRoot) return false;
8543
+ const piCodingAgentEntry = findAccessiblePath([
8544
+ buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
8545
+ ]);
8546
+ const piAiEntry = findAccessiblePath([
8547
+ buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
8548
+ path21.join(
8549
+ globalNpmRoot,
8550
+ "@mariozechner",
8551
+ "pi-coding-agent",
8552
+ "node_modules",
8553
+ "@mariozechner",
8554
+ "pi-ai",
8555
+ "dist",
8556
+ "index.js"
8557
+ )
8558
+ ]);
8559
+ if (!piCodingAgentEntry || !piAiEntry) return false;
8560
+ try {
8561
+ [piCodingAgentModule, piAiModule] = await Promise.all([
8562
+ import(pathToFileURL(piCodingAgentEntry).href),
8563
+ import(pathToFileURL(piAiEntry).href)
8564
+ ]);
8565
+ return true;
8566
+ } catch {
8567
+ return false;
8568
+ }
8569
+ }
8570
+ function installSdkModules(installDir) {
8571
+ console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
8572
+ mkdirSync(installDir, { recursive: true });
8573
+ execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
8574
+ cwd: installDir,
8575
+ stdio: "inherit"
8576
+ });
8577
+ }
8578
+ async function doLoadSdkModules() {
8579
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
8580
+ return;
8581
+ }
8582
+ if (await promptInstall()) {
8583
+ const installDir = findManagedSdkInstallRoot();
8584
+ installSdkModules(installDir);
8585
+ if (await tryImportManagedSdkModules()) {
8586
+ return;
8247
8587
  }
8248
8588
  }
8589
+ throw new Error(
8590
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
8591
+ );
8249
8592
  }
8250
8593
  async function loadSdkModules() {
8251
8594
  if (!piCodingAgentModule || !piAiModule) {
@@ -8302,12 +8645,16 @@ var PiCodingAgentProvider = class {
8302
8645
  try {
8303
8646
  const cwd = this.resolveCwd(request.cwd);
8304
8647
  const rawProvider = this.config.subprovider ?? "google";
8305
- const hasBaseUrl = !!this.config.baseUrl;
8648
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
8649
+ const hasBaseUrl = !!normalizedBaseUrl;
8306
8650
  const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
8307
8651
  const modelId = this.config.model ?? "gemini-2.5-flash";
8308
8652
  this.setApiKeyEnv(rawProvider, hasBaseUrl);
8309
- this.setBaseUrlEnv(rawProvider, hasBaseUrl);
8653
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
8310
8654
  let model = sdk.getModel(providerName, modelId);
8655
+ if (model && normalizedBaseUrl) {
8656
+ model = { ...model, baseUrl: normalizedBaseUrl };
8657
+ }
8311
8658
  if (!model) {
8312
8659
  const envProvider = providerName.replace(/-responses$/, "");
8313
8660
  model = {
@@ -8315,7 +8662,7 @@ var PiCodingAgentProvider = class {
8315
8662
  name: modelId,
8316
8663
  api: providerName,
8317
8664
  provider: envProvider,
8318
- baseUrl: this.config.baseUrl ?? "",
8665
+ baseUrl: normalizedBaseUrl ?? "",
8319
8666
  reasoning: false,
8320
8667
  input: ["text"],
8321
8668
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
@@ -8482,19 +8829,27 @@ ${fileList}`;
8482
8829
  }
8483
8830
  }
8484
8831
  /** Maps config baseUrl to the provider-specific env var the SDK reads. */
8485
- setBaseUrlEnv(providerName, hasBaseUrl = false) {
8486
- if (!this.config.baseUrl) return;
8832
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
8833
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
8834
+ if (!normalizedBaseUrl) return;
8487
8835
  const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
8488
8836
  if (envKey) {
8489
- process.env[envKey] = this.config.baseUrl;
8837
+ process.env[envKey] = normalizedBaseUrl;
8490
8838
  }
8491
8839
  }
8840
+ normalizeSdkBaseUrl(providerName, baseUrl) {
8841
+ if (!baseUrl) return void 0;
8842
+ if (providerName.toLowerCase() === "azure") {
8843
+ return normalizeAzureSdkBaseUrl(baseUrl);
8844
+ }
8845
+ return baseUrl;
8846
+ }
8492
8847
  resolveCwd(cwdOverride) {
8493
8848
  if (cwdOverride) {
8494
- return path20.resolve(cwdOverride);
8849
+ return path21.resolve(cwdOverride);
8495
8850
  }
8496
8851
  if (this.config.cwd) {
8497
- return path20.resolve(this.config.cwd);
8852
+ return path21.resolve(this.config.cwd);
8498
8853
  }
8499
8854
  return process.cwd();
8500
8855
  }
@@ -8513,9 +8868,9 @@ ${fileList}`;
8513
8868
  }
8514
8869
  resolveLogDirectory() {
8515
8870
  if (this.config.logDir) {
8516
- return path20.resolve(this.config.logDir);
8871
+ return path21.resolve(this.config.logDir);
8517
8872
  }
8518
- return path20.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
8873
+ return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
8519
8874
  }
8520
8875
  async createStreamLogger(request) {
8521
8876
  const logDir = this.resolveLogDirectory();
@@ -8529,7 +8884,7 @@ ${fileList}`;
8529
8884
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
8530
8885
  return void 0;
8531
8886
  }
8532
- const filePath = path20.join(logDir, buildLogFilename6(request, this.targetName));
8887
+ const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
8533
8888
  try {
8534
8889
  const logger = await PiStreamLogger2.create({
8535
8890
  filePath,
@@ -8754,7 +9109,7 @@ import path30 from "node:path";
8754
9109
  // src/evaluation/providers/vscode/utils/fs.ts
8755
9110
  import { constants as constants2 } from "node:fs";
8756
9111
  import { access as access2, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
8757
- import path21 from "node:path";
9112
+ import path22 from "node:path";
8758
9113
  async function pathExists(target) {
8759
9114
  try {
8760
9115
  await access2(target, constants2.F_OK);
@@ -8770,7 +9125,7 @@ async function readDirEntries(target) {
8770
9125
  const entries = await readdir2(target, { withFileTypes: true });
8771
9126
  return entries.map((entry) => ({
8772
9127
  name: entry.name,
8773
- absolutePath: path21.join(target, entry.name),
9128
+ absolutePath: path22.join(target, entry.name),
8774
9129
  isDirectory: entry.isDirectory()
8775
9130
  }));
8776
9131
  }
@@ -8785,9 +9140,9 @@ async function removeIfExists(target) {
8785
9140
  }
8786
9141
 
8787
9142
  // src/evaluation/providers/vscode/utils/path.ts
8788
- import path22 from "node:path";
9143
+ import path23 from "node:path";
8789
9144
  function pathToFileUri2(filePath) {
8790
- const absolutePath = path22.isAbsolute(filePath) ? filePath : path22.resolve(filePath);
9145
+ const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
8791
9146
  const normalizedPath = absolutePath.replace(/\\/g, "/");
8792
9147
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
8793
9148
  return `file:///${normalizedPath}`;
@@ -8796,7 +9151,7 @@ function pathToFileUri2(filePath) {
8796
9151
  }
8797
9152
 
8798
9153
  // src/evaluation/providers/vscode/dispatch/promptBuilder.ts
8799
- import path23 from "node:path";
9154
+ import path24 from "node:path";
8800
9155
 
8801
9156
  // src/evaluation/providers/vscode/utils/template.ts
8802
9157
  function renderTemplate2(content, variables) {
@@ -8888,8 +9243,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
8888
9243
  });
8889
9244
  }
8890
9245
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
8891
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path23.basename(file)}`).join("\n");
8892
- const responseList = responseFiles.map((file) => `"${path23.basename(file)}"`).join(", ");
9246
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
9247
+ const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
8893
9248
  return renderTemplate2(templateContent, {
8894
9249
  requestFiles: requestLines,
8895
9250
  responseList
@@ -8898,7 +9253,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
8898
9253
 
8899
9254
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
8900
9255
  import { readFile as readFile9 } from "node:fs/promises";
8901
- import path24 from "node:path";
9256
+ import path25 from "node:path";
8902
9257
 
8903
9258
  // src/evaluation/providers/vscode/utils/time.ts
8904
9259
  function sleep2(ms) {
@@ -8957,7 +9312,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
8957
9312
  }
8958
9313
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
8959
9314
  if (!silent) {
8960
- const fileList = responseFilesFinal.map((file) => path24.basename(file)).join(", ");
9315
+ const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
8961
9316
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
8962
9317
  }
8963
9318
  const deadline = Date.now() + timeoutMs;
@@ -8966,7 +9321,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
8966
9321
  while (pending.size > 0) {
8967
9322
  if (Date.now() >= deadline) {
8968
9323
  if (!silent) {
8969
- const remaining = [...pending].map((f) => path24.basename(f)).join(", ");
9324
+ const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
8970
9325
  console.error(
8971
9326
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
8972
9327
  );
@@ -9022,36 +9377,6 @@ import { promisify as promisify2 } from "node:util";
9022
9377
 
9023
9378
  // src/evaluation/providers/vscode/dispatch/constants.ts
9024
9379
  import path26 from "node:path";
9025
-
9026
- // src/paths.ts
9027
- import os2 from "node:os";
9028
- import path25 from "node:path";
9029
- var logged = false;
9030
- function getAgentvHome() {
9031
- const envHome = process.env.AGENTV_HOME;
9032
- if (envHome && envHome !== "undefined") {
9033
- if (!logged) {
9034
- logged = true;
9035
- console.warn(`Using AGENTV_HOME: ${envHome}`);
9036
- }
9037
- return envHome;
9038
- }
9039
- return path25.join(os2.homedir(), ".agentv");
9040
- }
9041
- function getWorkspacesRoot() {
9042
- return path25.join(getAgentvHome(), "workspaces");
9043
- }
9044
- function getSubagentsRoot() {
9045
- return path25.join(getAgentvHome(), "subagents");
9046
- }
9047
- function getTraceStateRoot() {
9048
- return path25.join(getAgentvHome(), "trace-state");
9049
- }
9050
- function getWorkspacePoolRoot() {
9051
- return path25.join(getAgentvHome(), "workspace-pool");
9052
- }
9053
-
9054
- // src/evaluation/providers/vscode/dispatch/constants.ts
9055
9380
  var DEFAULT_LOCK_NAME = "subagent.lock";
9056
9381
  var DEFAULT_ALIVE_FILENAME = ".alive";
9057
9382
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -10308,9 +10633,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
10308
10633
  }
10309
10634
 
10310
10635
  // src/evaluation/evaluators/scoring.ts
10311
- var PASS_THRESHOLD = 0.8;
10312
- function scoreToVerdict(score) {
10313
- return score >= PASS_THRESHOLD ? "pass" : "fail";
10636
+ var DEFAULT_THRESHOLD = 0.8;
10637
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
10638
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
10639
+ return score >= threshold ? "pass" : "fail";
10314
10640
  }
10315
10641
  function clampScore(value) {
10316
10642
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -10499,13 +10825,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
10499
10825
  async function execShellWithStdin(command, stdinPayload, options = {}) {
10500
10826
  const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
10501
10827
  const { tmpdir: tmpdir3 } = await import("node:os");
10502
- const path49 = await import("node:path");
10828
+ const path50 = await import("node:path");
10503
10829
  const { randomUUID: randomUUID10 } = await import("node:crypto");
10504
- const dir = path49.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10830
+ const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10505
10831
  await mkdir16(dir, { recursive: true });
10506
- const stdinPath = path49.join(dir, "stdin.txt");
10507
- const stdoutPath = path49.join(dir, "stdout.txt");
10508
- const stderrPath = path49.join(dir, "stderr.txt");
10832
+ const stdinPath = path50.join(dir, "stdin.txt");
10833
+ const stdoutPath = path50.join(dir, "stdout.txt");
10834
+ const stderrPath = path50.join(dir, "stderr.txt");
10509
10835
  await writeFile9(stdinPath, stdinPayload, "utf8");
10510
10836
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
10511
10837
  const { spawn: spawn5 } = await import("node:child_process");
@@ -11703,7 +12029,7 @@ ${outputSchema}`;
11703
12029
  parts.push("[[ ## scoring_criteria ## ]]");
11704
12030
  for (const rubric of rubrics) {
11705
12031
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
11706
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
12032
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
11707
12033
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
11708
12034
  if (rubric.outcome) {
11709
12035
  parts.push(`Description: ${rubric.outcome}`);
@@ -11757,54 +12083,106 @@ ${outputSchema}`;
11757
12083
  async runWithRetry(options) {
11758
12084
  const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
11759
12085
  let lastError;
12086
+ let lastInvalidResponse;
12087
+ let shouldAttemptStructureFix = false;
11760
12088
  for (let attempt = 1; attempt <= 3; attempt++) {
11761
12089
  try {
11762
- const model = graderProvider.asLanguageModel?.();
11763
- if (model) {
11764
- const modelOptions = {
11765
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11766
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11767
- };
11768
- const hasImages = images && images.length > 0;
11769
- const result = hasImages ? await generateText2({
11770
- model,
11771
- system: systemPrompt,
11772
- messages: [
11773
- {
11774
- role: "user",
11775
- content: [
11776
- { type: "text", text: userPrompt },
11777
- ...toAiSdkImageParts(images)
11778
- ]
11779
- }
11780
- ],
11781
- ...modelOptions
11782
- }) : await generateText2({
11783
- model,
11784
- system: systemPrompt,
11785
- prompt: userPrompt,
11786
- ...modelOptions
11787
- });
11788
- const data2 = schema.parse(parseJsonFromText(result.text));
11789
- const rawUsage = result.usage;
11790
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
11791
- return { data: data2, tokenUsage };
12090
+ const result = await this.generateStructuredResponse({
12091
+ context,
12092
+ graderProvider,
12093
+ systemPrompt,
12094
+ userPrompt,
12095
+ images
12096
+ });
12097
+ const canRepairResponse = result.text.trim().length > 0;
12098
+ lastInvalidResponse = canRepairResponse ? result : void 0;
12099
+ let data;
12100
+ try {
12101
+ data = schema.parse(parseJsonFromText(result.text));
12102
+ } catch (e) {
12103
+ lastError = e instanceof Error ? e : new Error(String(e));
12104
+ shouldAttemptStructureFix = canRepairResponse;
12105
+ continue;
11792
12106
  }
11793
- const response = await graderProvider.invoke({
11794
- question: userPrompt,
12107
+ return {
12108
+ data,
12109
+ providerResponse: result.providerResponse,
12110
+ tokenUsage: result.tokenUsage
12111
+ };
12112
+ } catch (e) {
12113
+ lastError = e instanceof Error ? e : new Error(String(e));
12114
+ }
12115
+ }
12116
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
12117
+ try {
12118
+ const repaired = await this.generateStructuredResponse({
12119
+ context,
12120
+ graderProvider,
11795
12121
  systemPrompt,
11796
- evalCaseId: context.evalCase.id,
11797
- attempt: context.attempt,
11798
- maxOutputTokens: this.maxOutputTokens,
11799
- temperature: this.temperature
12122
+ userPrompt: buildStructureRepairPrompt({
12123
+ validationError: lastError?.message ?? "Schema validation failed",
12124
+ invalidResponse: lastInvalidResponse.text
12125
+ })
11800
12126
  });
11801
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
11802
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
12127
+ const data = schema.parse(parseJsonFromText(repaired.text));
12128
+ return {
12129
+ data,
12130
+ providerResponse: repaired.providerResponse,
12131
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
12132
+ };
11803
12133
  } catch (e) {
11804
12134
  lastError = e instanceof Error ? e : new Error(String(e));
11805
12135
  }
11806
12136
  }
11807
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
12137
+ throw new Error(
12138
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
12139
+ );
12140
+ }
12141
+ async generateStructuredResponse(options) {
12142
+ const { context, graderProvider, systemPrompt, userPrompt, images } = options;
12143
+ const model = graderProvider.asLanguageModel?.();
12144
+ if (model) {
12145
+ const modelOptions = {
12146
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
12147
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
12148
+ };
12149
+ const hasImages = images && images.length > 0;
12150
+ const result = hasImages ? await generateText2({
12151
+ model,
12152
+ system: systemPrompt,
12153
+ messages: [
12154
+ {
12155
+ role: "user",
12156
+ content: [
12157
+ { type: "text", text: userPrompt },
12158
+ ...toAiSdkImageParts(images)
12159
+ ]
12160
+ }
12161
+ ],
12162
+ ...modelOptions
12163
+ }) : await generateText2({
12164
+ model,
12165
+ system: systemPrompt,
12166
+ prompt: userPrompt,
12167
+ ...modelOptions
12168
+ });
12169
+ const rawUsage = result.usage;
12170
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
12171
+ return { text: result.text, tokenUsage };
12172
+ }
12173
+ const response = await graderProvider.invoke({
12174
+ question: userPrompt,
12175
+ systemPrompt,
12176
+ evalCaseId: context.evalCase.id,
12177
+ attempt: context.attempt,
12178
+ maxOutputTokens: this.maxOutputTokens,
12179
+ temperature: this.temperature
12180
+ });
12181
+ return {
12182
+ text: extractLastAssistantContent(response.output),
12183
+ providerResponse: response,
12184
+ tokenUsage: response.tokenUsage
12185
+ };
11808
12186
  }
11809
12187
  };
11810
12188
  function buildOutputSchema() {
@@ -11824,6 +12202,29 @@ function buildOutputSchema() {
11824
12202
  "}"
11825
12203
  ].join("\n");
11826
12204
  }
12205
+ function buildStructureRepairPrompt(options) {
12206
+ const { validationError, invalidResponse } = options;
12207
+ return [
12208
+ "The following evaluation response has useful grading content but invalid JSON structure.",
12209
+ "Repair it to satisfy the schema in the system prompt.",
12210
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
12211
+ "",
12212
+ "Validation error:",
12213
+ validationError,
12214
+ "",
12215
+ "Invalid response:",
12216
+ invalidResponse
12217
+ ].join("\n");
12218
+ }
12219
+ function sumTokenUsage(first, second) {
12220
+ if (!first && !second) {
12221
+ return void 0;
12222
+ }
12223
+ return {
12224
+ input: (first?.input ?? 0) + (second?.input ?? 0),
12225
+ output: (first?.output ?? 0) + (second?.output ?? 0)
12226
+ };
12227
+ }
11827
12228
  function buildRubricOutputSchema() {
11828
12229
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
11829
12230
  You must return a valid JSON object matching this schema:
@@ -11923,19 +12324,21 @@ function calculateScoreRangeResult(result, rubrics) {
11923
12324
  rawScores[rubric.id] = rawScore;
11924
12325
  totalWeight += rubric.weight;
11925
12326
  weightedScoreSum += normalizedScore * rubric.weight;
11926
- let requiredMinScore;
11927
- if (rubric.required_min_score !== void 0) {
11928
- requiredMinScore = rubric.required_min_score;
12327
+ let minScoreThreshold;
12328
+ if (rubric.min_score !== void 0) {
12329
+ minScoreThreshold = rubric.min_score;
12330
+ } else if (rubric.required_min_score !== void 0) {
12331
+ minScoreThreshold = rubric.required_min_score / 10;
11929
12332
  } else if (rubric.required === true) {
11930
- requiredMinScore = 10;
12333
+ minScoreThreshold = 1;
11931
12334
  }
11932
12335
  const matchingRange = rubric.score_ranges?.find(
11933
12336
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
11934
12337
  );
11935
12338
  const rangeDescription = matchingRange?.outcome ?? "";
11936
12339
  const criterionLabel = rubric.outcome ?? rubric.id;
11937
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
11938
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
12340
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
12341
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
11939
12342
  failedRequired = true;
11940
12343
  }
11941
12344
  assertions.push({
@@ -12012,11 +12415,11 @@ function createFilesystemTools(workspacePath) {
12012
12415
  execute: async (input) => {
12013
12416
  try {
12014
12417
  const resolved = resolveSandboxed(workspacePath, input.path);
12015
- const stat10 = await fs2.stat(resolved);
12016
- if (stat10.isDirectory()) {
12418
+ const stat11 = await fs2.stat(resolved);
12419
+ if (stat11.isDirectory()) {
12017
12420
  return { error: `'${input.path}' is a directory, not a file` };
12018
12421
  }
12019
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
12422
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
12020
12423
  const fd = await fs2.open(resolved, "r");
12021
12424
  try {
12022
12425
  await fd.read(buffer, 0, buffer.length, 0);
@@ -12024,8 +12427,8 @@ function createFilesystemTools(workspacePath) {
12024
12427
  await fd.close();
12025
12428
  }
12026
12429
  const content = buffer.toString("utf-8");
12027
- const truncated = stat10.size > MAX_FILE_SIZE;
12028
- return { content, truncated, size: stat10.size };
12430
+ const truncated = stat11.size > MAX_FILE_SIZE;
12431
+ return { content, truncated, size: stat11.size };
12029
12432
  } catch (error) {
12030
12433
  return { error: error instanceof Error ? error.message : String(error) };
12031
12434
  }
@@ -12076,8 +12479,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
12076
12479
  const ext = path35.extname(entry.name).toLowerCase();
12077
12480
  if (BINARY_EXTENSIONS.has(ext)) continue;
12078
12481
  try {
12079
- const stat10 = await fs2.stat(fullPath);
12080
- if (stat10.size > MAX_FILE_SIZE) continue;
12482
+ const stat11 = await fs2.stat(fullPath);
12483
+ if (stat11.size > MAX_FILE_SIZE) continue;
12081
12484
  const content = await fs2.readFile(fullPath, "utf-8");
12082
12485
  const lines = content.split("\n");
12083
12486
  for (let i = 0; i < lines.length; i++) {
@@ -12718,115 +13121,115 @@ var FieldAccuracyEvaluator = class {
12718
13121
  * Evaluate a single field against the expected value.
12719
13122
  */
12720
13123
  evaluateField(fieldConfig, candidateData, expectedData) {
12721
- const { path: path49, match, required = true, weight = 1 } = fieldConfig;
12722
- const candidateValue = resolvePath(candidateData, path49);
12723
- const expectedValue = resolvePath(expectedData, path49);
13124
+ const { path: path50, match, required = true, weight = 1 } = fieldConfig;
13125
+ const candidateValue = resolvePath(candidateData, path50);
13126
+ const expectedValue = resolvePath(expectedData, path50);
12724
13127
  if (expectedValue === void 0) {
12725
13128
  return {
12726
- path: path49,
13129
+ path: path50,
12727
13130
  score: 1,
12728
13131
  // No expected value means no comparison needed
12729
13132
  weight,
12730
13133
  hit: true,
12731
- message: `${path49}: no expected value`
13134
+ message: `${path50}: no expected value`
12732
13135
  };
12733
13136
  }
12734
13137
  if (candidateValue === void 0) {
12735
13138
  if (required) {
12736
13139
  return {
12737
- path: path49,
13140
+ path: path50,
12738
13141
  score: 0,
12739
13142
  weight,
12740
13143
  hit: false,
12741
- message: `${path49} (required, missing)`
13144
+ message: `${path50} (required, missing)`
12742
13145
  };
12743
13146
  }
12744
13147
  return {
12745
- path: path49,
13148
+ path: path50,
12746
13149
  score: 1,
12747
13150
  // Don't penalize missing optional fields
12748
13151
  weight: 0,
12749
13152
  // Zero weight means it won't affect the score
12750
13153
  hit: true,
12751
- message: `${path49}: optional field missing`
13154
+ message: `${path50}: optional field missing`
12752
13155
  };
12753
13156
  }
12754
13157
  switch (match) {
12755
13158
  case "exact":
12756
- return this.compareExact(path49, candidateValue, expectedValue, weight);
13159
+ return this.compareExact(path50, candidateValue, expectedValue, weight);
12757
13160
  case "numeric_tolerance":
12758
13161
  return this.compareNumericTolerance(
12759
- path49,
13162
+ path50,
12760
13163
  candidateValue,
12761
13164
  expectedValue,
12762
13165
  fieldConfig,
12763
13166
  weight
12764
13167
  );
12765
13168
  case "date":
12766
- return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
13169
+ return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
12767
13170
  default:
12768
13171
  return {
12769
- path: path49,
13172
+ path: path50,
12770
13173
  score: 0,
12771
13174
  weight,
12772
13175
  hit: false,
12773
- message: `${path49}: unknown match type "${match}"`
13176
+ message: `${path50}: unknown match type "${match}"`
12774
13177
  };
12775
13178
  }
12776
13179
  }
12777
13180
  /**
12778
13181
  * Exact equality comparison.
12779
13182
  */
12780
- compareExact(path49, candidateValue, expectedValue, weight) {
13183
+ compareExact(path50, candidateValue, expectedValue, weight) {
12781
13184
  if (deepEqual(candidateValue, expectedValue)) {
12782
13185
  return {
12783
- path: path49,
13186
+ path: path50,
12784
13187
  score: 1,
12785
13188
  weight,
12786
13189
  hit: true,
12787
- message: path49
13190
+ message: path50
12788
13191
  };
12789
13192
  }
12790
13193
  if (typeof candidateValue !== typeof expectedValue) {
12791
13194
  return {
12792
- path: path49,
13195
+ path: path50,
12793
13196
  score: 0,
12794
13197
  weight,
12795
13198
  hit: false,
12796
- message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13199
+ message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12797
13200
  };
12798
13201
  }
12799
13202
  return {
12800
- path: path49,
13203
+ path: path50,
12801
13204
  score: 0,
12802
13205
  weight,
12803
13206
  hit: false,
12804
- message: `${path49} (value mismatch)`
13207
+ message: `${path50} (value mismatch)`
12805
13208
  };
12806
13209
  }
12807
13210
  /**
12808
13211
  * Numeric comparison with absolute or relative tolerance.
12809
13212
  */
12810
- compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
13213
+ compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
12811
13214
  const { tolerance = 0, relative = false } = fieldConfig;
12812
13215
  const candidateNum = toNumber(candidateValue);
12813
13216
  const expectedNum = toNumber(expectedValue);
12814
13217
  if (candidateNum === null || expectedNum === null) {
12815
13218
  return {
12816
- path: path49,
13219
+ path: path50,
12817
13220
  score: 0,
12818
13221
  weight,
12819
13222
  hit: false,
12820
- message: `${path49} (non-numeric value)`
13223
+ message: `${path50} (non-numeric value)`
12821
13224
  };
12822
13225
  }
12823
13226
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12824
13227
  return {
12825
- path: path49,
13228
+ path: path50,
12826
13229
  score: 0,
12827
13230
  weight,
12828
13231
  hit: false,
12829
- message: `${path49} (invalid numeric value)`
13232
+ message: `${path50} (invalid numeric value)`
12830
13233
  };
12831
13234
  }
12832
13235
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12839,61 +13242,61 @@ var FieldAccuracyEvaluator = class {
12839
13242
  }
12840
13243
  if (withinTolerance) {
12841
13244
  return {
12842
- path: path49,
13245
+ path: path50,
12843
13246
  score: 1,
12844
13247
  weight,
12845
13248
  hit: true,
12846
- message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
13249
+ message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
12847
13250
  };
12848
13251
  }
12849
13252
  return {
12850
- path: path49,
13253
+ path: path50,
12851
13254
  score: 0,
12852
13255
  weight,
12853
13256
  hit: false,
12854
- message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13257
+ message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12855
13258
  };
12856
13259
  }
12857
13260
  /**
12858
13261
  * Date comparison with format normalization.
12859
13262
  */
12860
- compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
13263
+ compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
12861
13264
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12862
13265
  const candidateDate = parseDate(String(candidateValue), formats);
12863
13266
  const expectedDate = parseDate(String(expectedValue), formats);
12864
13267
  if (candidateDate === null) {
12865
13268
  return {
12866
- path: path49,
13269
+ path: path50,
12867
13270
  score: 0,
12868
13271
  weight,
12869
13272
  hit: false,
12870
- message: `${path49} (unparseable candidate date)`
13273
+ message: `${path50} (unparseable candidate date)`
12871
13274
  };
12872
13275
  }
12873
13276
  if (expectedDate === null) {
12874
13277
  return {
12875
- path: path49,
13278
+ path: path50,
12876
13279
  score: 0,
12877
13280
  weight,
12878
13281
  hit: false,
12879
- message: `${path49} (unparseable expected date)`
13282
+ message: `${path50} (unparseable expected date)`
12880
13283
  };
12881
13284
  }
12882
13285
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12883
13286
  return {
12884
- path: path49,
13287
+ path: path50,
12885
13288
  score: 1,
12886
13289
  weight,
12887
13290
  hit: true,
12888
- message: path49
13291
+ message: path50
12889
13292
  };
12890
13293
  }
12891
13294
  return {
12892
- path: path49,
13295
+ path: path50,
12893
13296
  score: 0,
12894
13297
  weight,
12895
13298
  hit: false,
12896
- message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13299
+ message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12897
13300
  };
12898
13301
  }
12899
13302
  /**
@@ -12926,11 +13329,11 @@ var FieldAccuracyEvaluator = class {
12926
13329
  };
12927
13330
  }
12928
13331
  };
12929
- function resolvePath(obj, path49) {
12930
- if (!path49 || !obj) {
13332
+ function resolvePath(obj, path50) {
13333
+ if (!path50 || !obj) {
12931
13334
  return void 0;
12932
13335
  }
12933
- const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
13336
+ const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
12934
13337
  let current = obj;
12935
13338
  for (const part of parts) {
12936
13339
  if (current === null || current === void 0) {
@@ -13422,8 +13825,8 @@ var TokenUsageEvaluator = class {
13422
13825
  };
13423
13826
 
13424
13827
  // src/evaluation/evaluators/tool-trajectory.ts
13425
- function getNestedValue(obj, path49) {
13426
- const parts = path49.split(".");
13828
+ function getNestedValue(obj, path50) {
13829
+ const parts = path50.split(".");
13427
13830
  let current = obj;
13428
13831
  for (const part of parts) {
13429
13832
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -15195,7 +15598,7 @@ var WorkspacePoolManager = class {
15195
15598
  }
15196
15599
  /**
15197
15600
  * Reset an existing slot for reuse:
15198
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
15601
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
15199
15602
  * 2. Re-copy template files (skip repo directories)
15200
15603
  */
15201
15604
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -15208,7 +15611,17 @@ var WorkspacePoolManager = class {
15208
15611
  continue;
15209
15612
  }
15210
15613
  const ref = repo.checkout?.ref ?? "HEAD";
15211
- await git(["reset", "--hard", ref], { cwd: repoDir });
15614
+ const resolve = repo.checkout?.resolve ?? "remote";
15615
+ if (resolve === "remote") {
15616
+ const fetchArgs = ["fetch", "origin", ref];
15617
+ if (repo.clone?.depth) {
15618
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
15619
+ }
15620
+ await git(fetchArgs, { cwd: repoDir });
15621
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
15622
+ } else {
15623
+ await git(["reset", "--hard", ref], { cwd: repoDir });
15624
+ }
15212
15625
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
15213
15626
  await git(["clean", cleanFlag], { cwd: repoDir });
15214
15627
  }
@@ -15505,7 +15918,7 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
15505
15918
  }
15506
15919
 
15507
15920
  // src/evaluation/orchestrator.ts
15508
- function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
15921
+ function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
15509
15922
  return score >= threshold ? "ok" : "quality_failure";
15510
15923
  }
15511
15924
  function buildSkippedEvaluatorError(scores) {
@@ -15597,7 +16010,7 @@ async function runEvaluation(options) {
15597
16010
  const filteredEvalCases = filterEvalCases(evalCases, filter);
15598
16011
  if (filteredEvalCases.length === 0) {
15599
16012
  if (filter) {
15600
- throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
16013
+ throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
15601
16014
  }
15602
16015
  return [];
15603
16016
  }
@@ -15649,6 +16062,9 @@ async function runEvaluation(options) {
15649
16062
  const graderName = targetContext.graderTarget ?? targetContext.name;
15650
16063
  const resolvedGrader = resolveTargetByName(graderName);
15651
16064
  if (!resolvedGrader) {
16065
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
16066
+ return void 0;
16067
+ }
15652
16068
  return getOrCreateProvider(targetContext);
15653
16069
  }
15654
16070
  return getOrCreateProvider(resolvedGrader);
@@ -15979,7 +16395,7 @@ async function runEvaluation(options) {
15979
16395
  const budgetResult = {
15980
16396
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15981
16397
  testId: evalCase.id,
15982
- dataset: evalCase.dataset,
16398
+ suite: evalCase.suite,
15983
16399
  category: evalCase.category,
15984
16400
  score: 0,
15985
16401
  assertions: [],
@@ -16016,7 +16432,7 @@ async function runEvaluation(options) {
16016
16432
  const haltResult = {
16017
16433
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16018
16434
  testId: evalCase.id,
16019
- dataset: evalCase.dataset,
16435
+ suite: evalCase.suite,
16020
16436
  category: evalCase.category,
16021
16437
  score: 0,
16022
16438
  assertions: [],
@@ -16328,7 +16744,7 @@ async function runBatchEvaluation(options) {
16328
16744
  targetResolver,
16329
16745
  availableTargets,
16330
16746
  verbose,
16331
- threshold: batchThreshold
16747
+ threshold: evalCase.threshold ?? batchThreshold
16332
16748
  });
16333
16749
  if (providerError) {
16334
16750
  result = {
@@ -16790,8 +17206,9 @@ async function runEvalCase(options) {
16790
17206
  fileChanges,
16791
17207
  workspacePath,
16792
17208
  verbose,
16793
- threshold: caseThreshold
17209
+ threshold: evalCase.threshold ?? caseThreshold
16794
17210
  });
17211
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
16795
17212
  const totalDurationMs = Date.now() - caseStartMs;
16796
17213
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
16797
17214
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -16805,7 +17222,7 @@ async function runEvalCase(options) {
16805
17222
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
16806
17223
  };
16807
17224
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
16808
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
17225
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
16809
17226
  const targetUsedField = targetUsed ? { targetUsed } : {};
16810
17227
  const finalResult = providerError ? {
16811
17228
  ...result,
@@ -17006,7 +17423,8 @@ async function evaluateCandidate(options) {
17006
17423
  targetResolver,
17007
17424
  availableTargets,
17008
17425
  fileChanges,
17009
- workspacePath
17426
+ workspacePath,
17427
+ threshold: evalThreshold
17010
17428
  });
17011
17429
  const completedAt = nowFn();
17012
17430
  let agentRequest;
@@ -17037,7 +17455,7 @@ async function evaluateCandidate(options) {
17037
17455
  return {
17038
17456
  timestamp: completedAt.toISOString(),
17039
17457
  testId: evalCase.id,
17040
- dataset: evalCase.dataset,
17458
+ suite: evalCase.suite,
17041
17459
  category: evalCase.category,
17042
17460
  conversationId: evalCase.conversation_id,
17043
17461
  score: score.score,
@@ -17080,7 +17498,8 @@ async function runEvaluatorsForCase(options) {
17080
17498
  targetResolver,
17081
17499
  availableTargets,
17082
17500
  fileChanges,
17083
- workspacePath
17501
+ workspacePath,
17502
+ threshold
17084
17503
  } = options;
17085
17504
  if (evalCase.assertions && evalCase.assertions.length > 0) {
17086
17505
  return runEvaluatorList({
@@ -17106,7 +17525,8 @@ async function runEvaluatorsForCase(options) {
17106
17525
  targetResolver,
17107
17526
  availableTargets,
17108
17527
  fileChanges,
17109
- workspacePath
17528
+ workspacePath,
17529
+ threshold
17110
17530
  });
17111
17531
  }
17112
17532
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -17208,7 +17628,8 @@ async function runEvaluatorList(options) {
17208
17628
  name: evaluatorConfig.name,
17209
17629
  type: evaluatorConfig.type,
17210
17630
  weight,
17211
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17631
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
17632
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
17212
17633
  });
17213
17634
  scores.push({
17214
17635
  name: evaluatorConfig.name,
@@ -17243,7 +17664,8 @@ async function runEvaluatorList(options) {
17243
17664
  name: evaluatorConfig.name ?? "unknown",
17244
17665
  type: evaluatorConfig.type ?? "llm-grader",
17245
17666
  weight,
17246
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17667
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
17668
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
17247
17669
  });
17248
17670
  scores.push({
17249
17671
  name: evaluatorConfig.name ?? "unknown",
@@ -17277,9 +17699,10 @@ async function runEvaluatorList(options) {
17277
17699
  }
17278
17700
  }
17279
17701
  }
17702
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
17280
17703
  const hasRequiredFailure = scored.some((entry) => {
17281
17704
  if (!entry.required) return false;
17282
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
17705
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
17283
17706
  return entry.score.score < minScore;
17284
17707
  });
17285
17708
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -17290,17 +17713,23 @@ async function runEvaluatorList(options) {
17290
17713
  const expectedAspectCount = assertions.length || 1;
17291
17714
  const score = {
17292
17715
  score: aggregateScore,
17293
- verdict: scoreToVerdict(aggregateScore),
17716
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
17294
17717
  assertions,
17295
17718
  expectedAspectCount
17296
17719
  };
17297
17720
  return { score, scores };
17298
17721
  }
17722
+ function formatFilter(filter) {
17723
+ return typeof filter === "string" ? filter : filter.join(", ");
17724
+ }
17725
+ function matchesFilter3(id, filter) {
17726
+ return typeof filter === "string" ? micromatch3.isMatch(id, filter) : filter.some((pattern) => micromatch3.isMatch(id, pattern));
17727
+ }
17299
17728
  function filterEvalCases(evalCases, filter) {
17300
17729
  if (!filter) {
17301
17730
  return evalCases;
17302
17731
  }
17303
- return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
17732
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
17304
17733
  }
17305
17734
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
17306
17735
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -17387,7 +17816,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
17387
17816
  return {
17388
17817
  timestamp: timestamp.toISOString(),
17389
17818
  testId: evalCase.id,
17390
- dataset: evalCase.dataset,
17819
+ suite: evalCase.suite,
17391
17820
  category: evalCase.category,
17392
17821
  conversationId: evalCase.conversation_id,
17393
17822
  score: 0,
@@ -17659,6 +18088,7 @@ async function evaluate(config) {
17659
18088
  verbose: config.verbose,
17660
18089
  maxConcurrency: config.workers ?? 3,
17661
18090
  filter: config.filter,
18091
+ threshold: config.threshold,
17662
18092
  evalCases,
17663
18093
  onResult: async (result) => {
17664
18094
  collectedResults.push(result);
@@ -17669,19 +18099,19 @@ async function evaluate(config) {
17669
18099
  const durationMs = Date.now() - startTime;
17670
18100
  return {
17671
18101
  results: allResults,
17672
- summary: computeSummary(allResults, durationMs)
18102
+ summary: computeSummary(allResults, durationMs, config.threshold)
17673
18103
  };
17674
18104
  }
17675
18105
  function mapAssertionType(type) {
17676
18106
  return type.replace(/_/g, "-");
17677
18107
  }
17678
- function computeSummary(results, durationMs) {
18108
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
17679
18109
  const total = results.length;
17680
18110
  let passed = 0;
17681
18111
  let scoreSum = 0;
17682
18112
  for (const r of results) {
17683
18113
  scoreSum += r.score;
17684
- if (r.score >= PASS_THRESHOLD) {
18114
+ if (r.score >= threshold) {
17685
18115
  passed++;
17686
18116
  }
17687
18117
  }
@@ -17794,7 +18224,7 @@ var CONFIG_FILE_NAMES = [
17794
18224
  ];
17795
18225
  async function loadTsConfig(projectRoot) {
17796
18226
  const { existsSync: existsSync7 } = await import("node:fs");
17797
- const { pathToFileURL } = await import("node:url");
18227
+ const { pathToFileURL: pathToFileURL2 } = await import("node:url");
17798
18228
  const { join: join2 } = await import("node:path");
17799
18229
  for (const fileName of CONFIG_FILE_NAMES) {
17800
18230
  const filePath = join2(projectRoot, fileName);
@@ -17802,7 +18232,7 @@ async function loadTsConfig(projectRoot) {
17802
18232
  continue;
17803
18233
  }
17804
18234
  try {
17805
- const fileUrl = pathToFileURL(filePath).href;
18235
+ const fileUrl = pathToFileURL2(filePath).href;
17806
18236
  const mod = await import(fileUrl);
17807
18237
  const config = mod.default ?? mod;
17808
18238
  return AgentVConfigSchema.parse(config);
@@ -17936,7 +18366,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
17936
18366
  }
17937
18367
 
17938
18368
  // src/projects.ts
17939
- import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
18369
+ import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
17940
18370
  import path47 from "node:path";
17941
18371
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
17942
18372
  function getProjectsRegistryPath() {
@@ -17962,7 +18392,7 @@ function saveProjectRegistry(registry) {
17962
18392
  const registryPath = getProjectsRegistryPath();
17963
18393
  const dir = path47.dirname(registryPath);
17964
18394
  if (!existsSync6(dir)) {
17965
- mkdirSync(dir, { recursive: true });
18395
+ mkdirSync2(dir, { recursive: true });
17966
18396
  }
17967
18397
  writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
17968
18398
  }
@@ -18228,7 +18658,7 @@ var OtelTraceExporter = class {
18228
18658
  rootSpan.setAttribute("gen_ai.system", "agentv");
18229
18659
  rootSpan.setAttribute("agentv.test_id", result.testId);
18230
18660
  rootSpan.setAttribute("agentv.target", result.target);
18231
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
18661
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
18232
18662
  rootSpan.setAttribute("agentv.score", result.score);
18233
18663
  if (captureContent && result.output.length > 0) {
18234
18664
  const lastMsg = result.output[result.output.length - 1];
@@ -18437,7 +18867,7 @@ var OtelStreamingObserver = class {
18437
18867
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
18438
18868
  this.rootSpan.setAttribute("agentv.test_id", testId);
18439
18869
  this.rootSpan.setAttribute("agentv.target", target);
18440
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
18870
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
18441
18871
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
18442
18872
  }
18443
18873
  /** Create and immediately export a tool span */
@@ -18786,11 +19216,241 @@ function extractToolResultContent(content) {
18786
19216
  return parts.length > 0 ? parts.join("") : void 0;
18787
19217
  }
18788
19218
 
18789
- // src/import/session-discovery.ts
19219
+ // src/import/codex-parser.ts
19220
/**
 * Convert the text of a Codex rollout file (JSONL) into a normalized transcript.
 *
 * Every non-blank line is parsed as JSON. Lines that are not valid JSON, that
 * do not decode to an object, or that have no `type` are skipped. Recognized
 * entry types:
 *   - `session_meta`  : session id, cwd, CLI version and (when present) model.
 *   - `turn_context`  : supplies model / cwd only when not already known.
 *   - `response_item` : user/assistant messages, tool calls and tool outputs.
 *                       `developer` messages and `reasoning` items are ignored.
 *
 * A tool call becomes its own assistant message with a single `toolCalls`
 * entry; the matching `*_output` item (same `call_id`) is merged into that
 * entry as `output`.
 *
 * @param {string} jsonl - Full contents of the rollout file.
 * @returns {{messages: object[], source: object, tokenUsage: undefined,
 *            durationMs: (number|undefined), costUsd: null}}
 *   `durationMs` is the span between the first and last entry timestamps, or
 *   `undefined` when no usable timestamps were found.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  // call_id -> location of the tool call awaiting its output.
  const pendingCalls = new Map();
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;

  // Tool input usually arrives as a JSON-encoded string; keep the raw value
  // when it is not a string or is not valid JSON.
  const decodeToolInput = (raw) => {
    if (typeof raw !== "string") return raw;
    try {
      return JSON.parse(raw);
    } catch {
      return raw;
    }
  };

  // Shared handling for `function_call` and `custom_tool_call` items.
  const recordToolCall = (payload) => {
    const callId = String(payload.call_id ?? "");
    // `function_call` uses `arguments`; custom tool calls carry their payload
    // in `input` (OpenAI Responses API item shape), so fall back to it.
    const rawInput = payload.arguments !== undefined ? payload.arguments : payload.input;
    const toolCall = {
      tool: String(payload.name ?? ""),
      input: decodeToolInput(rawInput),
      id: callId
    };
    const msgIdx = messages.length;
    messages.push({ role: "assistant", toolCalls: [toolCall] });
    if (callId) {
      pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
    }
  };

  // Merge a tool output into the call that produced it (matched by call_id).
  const attachToolOutput = (payload) => {
    const callId = String(payload.call_id ?? "");
    const pending = pendingCalls.get(callId);
    if (!pending) return;
    const existingMsg = messages[pending.msgIdx];
    const existingCalls = [...(existingMsg.toolCalls ?? [])];
    existingCalls[pending.toolIdx] = {
      ...existingCalls[pending.toolIdx],
      output: payload.output
    };
    messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
    pendingCalls.delete(callId);
  };

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // JSON.parse can yield null or a primitive (e.g. the line `null`);
    // those are not rollout entries and must not be dereferenced.
    if (typeof entry !== "object" || entry === null || !entry.type) continue;

    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};

    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        const itemType = String(payload.type ?? "");
        switch (itemType) {
          case "message": {
            const role = String(payload.role ?? "");
            if (role === "user" || role === "assistant") {
              const content = extractResponseItemContent(payload.content);
              if (content) {
                messages.push({ role, content });
              }
            }
            break;
          }
          case "function_call":
          case "custom_tool_call":
            recordToolCall(payload);
            break;
          case "function_call_output":
          case "custom_tool_call_output":
            attachToolOutput(payload);
            break;
          // Skip reasoning blocks (thinking tokens)
          case "reasoning":
            break;
        }
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps produce NaN; report "unknown" instead.
    if (Number.isFinite(elapsed)) {
      durationMs = elapsed;
    }
  }

  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
19369
/**
 * Flatten the `content` of a response item into plain text.
 *
 * A string is returned unchanged. For an array, the `text` of every
 * non-null object element whose `text` is a string is concatenated with no
 * separator. Anything else, or an array with no such element, yields
 * `undefined`.
 *
 * @param {unknown} content - Raw `content` value from a response item.
 * @returns {string|undefined} The combined text, or `undefined` if none.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const texts = content
    .filter((item) => typeof item === "object" && item !== null && typeof item.text === "string")
    .map((item) => item.text);
  if (texts.length === 0) {
    return void 0;
  }
  return texts.join("");
}
19383
+
19384
+ // src/import/codex-session-discovery.ts
18790
19385
  import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
18791
19386
  import { homedir as homedir3 } from "node:os";
18792
19387
  import path48 from "node:path";
18793
- var DEFAULT_PROJECTS_DIR = () => path48.join(homedir3(), ".claude", "projects");
19388
+ var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");
19389
/**
 * Find Codex rollout files under `<sessionsDir>/<year>/<month>/<day>/`.
 *
 * Only files named `rollout-*.jsonl` are considered. Results are sorted by
 * file modification time, newest first, and truncated to the requested limit.
 * Directories that cannot be listed are skipped; a file that cannot be
 * stat'ed is kept with an epoch (`new Date(0)`) timestamp.
 *
 * @param {object} [opts]
 * @param {string} [opts.sessionsDir] - Root directory to scan; defaults to
 *   `DEFAULT_SESSIONS_DIR()`.
 * @param {boolean} [opts.latest] - When truthy, return at most one session.
 * @param {number} [opts.limit] - Maximum number of sessions (default 10).
 * @param {string} [opts.date] - Only include the day directory whose
 *   `<year>-<month>-<day>` path equals this string exactly.
 * @returns {Promise<Array<{sessionId: string, filePath: string,
 *   filename: string, updatedAt: Date}>>}
 */
async function discoverCodexSessions(opts) {
  const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
  const wantedDate = opts?.date;

  // Best-effort directory listing: unreadable / non-directory -> undefined.
  const listDir = async (dir) => {
    try {
      return await readdir8(dir);
    } catch {
      return void 0;
    }
  };

  // With a date filter, a year or month directory can only contain a match
  // if the requested date starts with its prefix, so non-matching subtrees
  // are skipped without being read. The day level requires exact equality.
  const dateAllows = (candidate, exact) => {
    if (!wantedDate) return true;
    if (typeof wantedDate !== "string") return false;
    return exact ? candidate === wantedDate : wantedDate.startsWith(candidate);
  };

  const describeRollout = async (dayPath, file) => {
    const filePath = path48.join(dayPath, file);
    const nameWithoutExt = file.replace(/\.jsonl$/, "");
    const parts = nameWithoutExt.split("-");
    // The session id is the last five dash-separated segments of the name.
    const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
    let updatedAt;
    try {
      updatedAt = (await stat9(filePath)).mtime;
    } catch {
      updatedAt = new Date(0);
    }
    return { sessionId, filePath, filename: file, updatedAt };
  };

  const yearDirs = await listDir(sessionsDir);
  if (!yearDirs) {
    return [];
  }

  const sessions = [];
  for (const year of yearDirs) {
    if (!dateAllows(`${year}-`, false)) continue;
    const yearPath = path48.join(sessionsDir, year);
    const monthDirs = await listDir(yearPath);
    if (!monthDirs) continue;

    for (const month of monthDirs) {
      if (!dateAllows(`${year}-${month}-`, false)) continue;
      const monthPath = path48.join(yearPath, month);
      const dayDirs = await listDir(monthPath);
      if (!dayDirs) continue;

      for (const day of dayDirs) {
        if (!dateAllows(`${year}-${month}-${day}`, true)) continue;
        const dayPath = path48.join(monthPath, day);
        const files = await listDir(dayPath);
        if (!files) continue;

        const rollouts = files.filter(
          (file) => file.startsWith("rollout-") && file.endsWith(".jsonl")
        );
        // Stat the day's files concurrently; Promise.all keeps listing order.
        const found = await Promise.all(
          rollouts.map((file) => describeRollout(dayPath, file))
        );
        sessions.push(...found);
      }
    }
  }

  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
19448
+
19449
+ // src/import/session-discovery.ts
19450
+ import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
19451
+ import { homedir as homedir4 } from "node:os";
19452
+ import path49 from "node:path";
19453
+ var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
18794
19454
  function encodeProjectPath(projectPath) {
18795
19455
  return projectPath.replace(/\//g, "-");
18796
19456
  }
@@ -18799,7 +19459,7 @@ async function discoverClaudeSessions(opts) {
18799
19459
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
18800
19460
  let projectDirs;
18801
19461
  try {
18802
- projectDirs = await readdir8(projectsDir);
19462
+ projectDirs = await readdir9(projectsDir);
18803
19463
  } catch {
18804
19464
  return [];
18805
19465
  }
@@ -18809,10 +19469,10 @@ async function discoverClaudeSessions(opts) {
18809
19469
  }
18810
19470
  const sessions = [];
18811
19471
  for (const projectDir of projectDirs) {
18812
- const dirPath = path48.join(projectsDir, projectDir);
19472
+ const dirPath = path49.join(projectsDir, projectDir);
18813
19473
  let entries;
18814
19474
  try {
18815
- entries = await readdir8(dirPath);
19475
+ entries = await readdir9(dirPath);
18816
19476
  } catch {
18817
19477
  continue;
18818
19478
  }
@@ -18820,10 +19480,10 @@ async function discoverClaudeSessions(opts) {
18820
19480
  if (!entry.endsWith(".jsonl")) continue;
18821
19481
  const sessionId = entry.replace(/\.jsonl$/, "");
18822
19482
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
18823
- const filePath = path48.join(dirPath, entry);
19483
+ const filePath = path49.join(dirPath, entry);
18824
19484
  let updatedAt;
18825
19485
  try {
18826
- const fileStat = await stat9(filePath);
19486
+ const fileStat = await stat10(filePath);
18827
19487
  updatedAt = fileStat.mtime;
18828
19488
  } catch {
18829
19489
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -18842,10 +19502,85 @@ async function discoverClaudeSessions(opts) {
18842
19502
 
18843
19503
  // src/import/types.ts
18844
19504
  import { readFile as readFile14 } from "node:fs/promises";
19505
/**
 * Map a parsed transcript entry to the snake_case record written to a
 * transcript JSONL file.
 *
 * `input` is the content of the first message whose role is `"user"`, or an
 * empty string when there is no such message or its content is not a string.
 * `source.cwd` falls back to `source.projectPath` when `cwd` is null or
 * undefined.
 *
 * @param {object} entry - Entry with `messages`, `source`, and optional
 *   `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} Record with `input`, `output`, `token_usage`,
 *   `duration_ms`, `cost_usd` and `source`.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage, durationMs, costUsd, source } = entry;

  let input = "";
  for (const message of messages) {
    if (message.role === "user") {
      // Only the first user message is considered, even if it has no text.
      if (typeof message.content === "string") {
        input = message.content;
      }
      break;
    }
  }

  let usage;
  if (tokenUsage) {
    usage = {
      input: tokenUsage.input,
      output: tokenUsage.output,
      cached: tokenUsage.cached
    };
  }

  const sourceRecord = {
    provider: source.provider,
    session_id: source.sessionId,
    model: source.model,
    timestamp: source.startedAt,
    git_branch: source.gitBranch,
    cwd: source.cwd ?? source.projectPath,
    version: source.version
  };

  return {
    input,
    output: messages,
    token_usage: usage,
    duration_ms: durationMs,
    cost_usd: costUsd,
    source: sourceRecord
  };
}
19529
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * Lines are trimmed before parsing, which also removes a leading UTF-8 byte
 * order mark and trailing `\r` from CRLF files.
 *
 * @param {string} filePath - Path of the JSONL file to read.
 * @returns {Promise<unknown[]>} Parsed records, in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message names the
 *   file and 1-based line number, and `cause` holds the original error.
 */
async function readTranscriptJsonl(filePath) {
  const text = await readFile14(filePath, "utf8");
  const records = [];
  const rawLines = text.split("\n");
  for (let index = 0; index < rawLines.length; index++) {
    const candidate = rawLines[index].trim();
    if (candidate.length === 0) continue;
    try {
      records.push(JSON.parse(candidate));
    } catch (err) {
      // Keep the SyntaxError type, but say which line of which file is bad.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(
        `Invalid JSON on line ${index + 1} of ${filePath}: ${detail}`,
        { cause: err }
      );
    }
  }
  return records;
}
18845
19533
  async function readTranscriptFile(filePath) {
18846
19534
  return readFile14(filePath, "utf8");
18847
19535
  }
18848
19536
 
19537
+ // src/import/transcript-provider.ts
19538
/**
 * Provider that replays pre-recorded transcript lines instead of calling a
 * live target. Each `invoke()` call consumes the next line, in order.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  // Index of the next transcript line to hand out.
  cursor = 0;
  /**
   * @param {string} targetName - Name used for `targetName` and in `id`
   *   (`transcript:<targetName>`).
   * @param {object[]} lines - Parsed transcript records to replay.
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The target name is the first line's `source.provider`, or `"transcript"`
   * when that line has no `source` or no provider.
   *
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<_TranscriptProvider>}
   * @throws {Error} When the file contains no records.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // A record without a `source` object must fall back, not crash.
    const providerName = lines[0]?.source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript lines available for replay. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next recorded response. The request itself is ignored.
   *
   * @param {unknown} _request - Unused.
   * @returns {Promise<{output: unknown, tokenUsage: (object|undefined),
   *   durationMs: (number|undefined), costUsd: (number|undefined),
   *   startTime: (string|undefined)}>}
   * @throws {Error} When every line has already been consumed.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // A recorded `null` cost is reported as "unknown".
      costUsd: line.cost_usd ?? void 0,
      // `source` may be absent in hand-written transcripts.
      startTime: line.source?.timestamp
    };
  }
};
19583
+
18849
19584
  // src/index.ts
18850
19585
  function createAgentKernel() {
18851
19586
  return { status: "stub" };
@@ -18859,6 +19594,7 @@ export {
18859
19594
  DEFAULT_EVALUATOR_TEMPLATE,
18860
19595
  DEFAULT_EVAL_PATTERNS,
18861
19596
  DEFAULT_EXPLORATION_TOOLS,
19597
+ DEFAULT_THRESHOLD,
18862
19598
  DeterministicAssertionEvaluator,
18863
19599
  EvaluatorRegistry,
18864
19600
  ExecutionMetricsEvaluator,
@@ -18880,6 +19616,7 @@ export {
18880
19616
  TemplateNotFoundError,
18881
19617
  TokenUsageEvaluator,
18882
19618
  ToolTrajectoryEvaluator,
19619
+ TranscriptProvider,
18883
19620
  WorkspaceCreationError,
18884
19621
  WorkspacePoolManager,
18885
19622
  addProject,
@@ -18916,6 +19653,7 @@ export {
18916
19653
  detectFormat,
18917
19654
  discoverAssertions,
18918
19655
  discoverClaudeSessions,
19656
+ discoverCodexSessions,
18919
19657
  discoverCopilotSessions,
18920
19658
  discoverGraders,
18921
19659
  discoverGraders as discoverJudges,
@@ -18976,6 +19714,8 @@ export {
18976
19714
  normalizeLineEndings,
18977
19715
  parseAgentSkillsEvals,
18978
19716
  parseClaudeSession,
19717
+ parseCodexSession,
19718
+ parseCopilotEvents,
18979
19719
  parseJsonFromText,
18980
19720
  parseJsonSafe,
18981
19721
  readJsonFile,
@@ -18983,6 +19723,7 @@ export {
18983
19723
  readTestSuiteMetadata,
18984
19724
  readTextFile,
18985
19725
  readTranscriptFile,
19726
+ readTranscriptJsonl,
18986
19727
  removeProject,
18987
19728
  resolveAndCreateProvider,
18988
19729
  resolveDelegatedTargetDefinition,
@@ -19015,6 +19756,7 @@ export {
19015
19756
  substituteVariables,
19016
19757
  toCamelCaseDeep,
19017
19758
  toSnakeCaseDeep,
19759
+ toTranscriptJsonLine,
19018
19760
  tokensPerTool,
19019
19761
  touchProject,
19020
19762
  transpileEvalYaml,