@agentv/core 4.6.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  import {
2
2
  COMMON_TARGET_SETTINGS,
3
+ LLM_GRADER_CAPABLE_KINDS,
3
4
  TEST_MESSAGE_ROLES,
4
5
  buildDirectoryChain,
5
6
  buildSearchRoots,
@@ -24,7 +25,7 @@ import {
24
25
  resolveDelegatedTargetDefinition,
25
26
  resolveFileReference,
26
27
  resolveTargetDefinition
27
- } from "./chunk-ZK4GG7PR.js";
28
+ } from "./chunk-75RFVESM.js";
28
29
  import {
29
30
  AgentvProvider
30
31
  } from "./chunk-PRNXHNLF.js";
@@ -856,8 +857,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
856
857
  const negate = rawEvaluator.negate === true ? true : void 0;
857
858
  if (isCustomType) {
858
859
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
859
- const required2 = parseRequired(rawEvaluator.required);
860
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
860
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
861
+ rawEvaluator.required,
862
+ rawEvaluator.min_score,
863
+ name,
864
+ evalId
865
+ );
866
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
861
867
  const config2 = {};
862
868
  for (const [key, value] of Object.entries(rawEvaluator)) {
863
869
  if (!knownProps2.has(key) && value !== void 0) {
@@ -869,6 +875,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
869
875
  type: customTypeName,
870
876
  ...weight2 !== void 0 ? { weight: weight2 } : {},
871
877
  ...required2 !== void 0 ? { required: required2 } : {},
878
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
872
879
  ...negate !== void 0 ? { negate } : {},
873
880
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
874
881
  });
@@ -938,7 +945,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
938
945
  );
939
946
  }
940
947
  }
941
- const required2 = parseRequired(rawEvaluator.required);
948
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
949
+ rawEvaluator.required,
950
+ rawEvaluator.min_score,
951
+ name,
952
+ evalId
953
+ );
942
954
  const knownProps2 = /* @__PURE__ */ new Set([
943
955
  "name",
944
956
  "type",
@@ -964,6 +976,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
964
976
  resolvedCwd,
965
977
  ...weight2 !== void 0 ? { weight: weight2 } : {},
966
978
  ...required2 !== void 0 ? { required: required2 } : {},
979
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
967
980
  ...negate !== void 0 ? { negate } : {},
968
981
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
969
982
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -1092,7 +1105,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1092
1105
  };
1093
1106
  }
1094
1107
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1095
- const required2 = parseRequired(rawEvaluator.required);
1108
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1109
+ rawEvaluator.required,
1110
+ rawEvaluator.min_score,
1111
+ name,
1112
+ evalId
1113
+ );
1096
1114
  evaluators.push({
1097
1115
  name,
1098
1116
  type: "composite",
@@ -1100,6 +1118,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1100
1118
  aggregator,
1101
1119
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1102
1120
  ...required2 !== void 0 ? { required: required2 } : {},
1121
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1103
1122
  ...negate !== void 0 ? { negate } : {}
1104
1123
  });
1105
1124
  continue;
@@ -1210,7 +1229,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1210
1229
  continue;
1211
1230
  }
1212
1231
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1213
- const required2 = parseRequired(rawEvaluator.required);
1232
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1233
+ rawEvaluator.required,
1234
+ rawEvaluator.min_score,
1235
+ name,
1236
+ evalId
1237
+ );
1214
1238
  const config2 = {
1215
1239
  name,
1216
1240
  type: "tool-trajectory",
@@ -1219,6 +1243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1219
1243
  ...expected ? { expected } : {},
1220
1244
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1221
1245
  ...required2 !== void 0 ? { required: required2 } : {},
1246
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1222
1247
  ...negate !== void 0 ? { negate } : {},
1223
1248
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
1224
1249
  };
@@ -1281,7 +1306,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1281
1306
  const aggregation = asString(rawEvaluator.aggregation);
1282
1307
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
1283
1308
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1284
- const required2 = parseRequired(rawEvaluator.required);
1309
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1310
+ rawEvaluator.required,
1311
+ rawEvaluator.min_score,
1312
+ name,
1313
+ evalId
1314
+ );
1285
1315
  evaluators.push({
1286
1316
  name,
1287
1317
  type: "field-accuracy",
@@ -1289,6 +1319,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1289
1319
  ...validAggregation ? { aggregation: validAggregation } : {},
1290
1320
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1291
1321
  ...required2 !== void 0 ? { required: required2 } : {},
1322
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1292
1323
  ...negate !== void 0 ? { negate } : {}
1293
1324
  });
1294
1325
  continue;
@@ -1302,13 +1333,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1302
1333
  continue;
1303
1334
  }
1304
1335
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1305
- const required2 = parseRequired(rawEvaluator.required);
1336
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1337
+ rawEvaluator.required,
1338
+ rawEvaluator.min_score,
1339
+ name,
1340
+ evalId
1341
+ );
1306
1342
  evaluators.push({
1307
1343
  name,
1308
1344
  type: "latency",
1309
1345
  threshold,
1310
1346
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1311
1347
  ...required2 !== void 0 ? { required: required2 } : {},
1348
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1312
1349
  ...negate !== void 0 ? { negate } : {}
1313
1350
  });
1314
1351
  continue;
@@ -1322,13 +1359,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1322
1359
  continue;
1323
1360
  }
1324
1361
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1325
- const required2 = parseRequired(rawEvaluator.required);
1362
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1363
+ rawEvaluator.required,
1364
+ rawEvaluator.min_score,
1365
+ name,
1366
+ evalId
1367
+ );
1326
1368
  evaluators.push({
1327
1369
  name,
1328
1370
  type: "cost",
1329
1371
  budget,
1330
1372
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1331
1373
  ...required2 !== void 0 ? { required: required2 } : {},
1374
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1332
1375
  ...negate !== void 0 ? { negate } : {}
1333
1376
  });
1334
1377
  continue;
@@ -1360,13 +1403,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1360
1403
  continue;
1361
1404
  }
1362
1405
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1363
- const required2 = parseRequired(rawEvaluator.required);
1406
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1407
+ rawEvaluator.required,
1408
+ rawEvaluator.min_score,
1409
+ name,
1410
+ evalId
1411
+ );
1364
1412
  evaluators.push({
1365
1413
  name,
1366
1414
  type: "token-usage",
1367
1415
  ...validLimits,
1368
1416
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1369
1417
  ...required2 !== void 0 ? { required: required2 } : {},
1418
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1370
1419
  ...negate !== void 0 ? { negate } : {}
1371
1420
  });
1372
1421
  continue;
@@ -1412,13 +1461,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1412
1461
  continue;
1413
1462
  }
1414
1463
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1415
- const required2 = parseRequired(rawEvaluator.required);
1464
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1465
+ rawEvaluator.required,
1466
+ rawEvaluator.min_score,
1467
+ name,
1468
+ evalId
1469
+ );
1416
1470
  evaluators.push({
1417
1471
  name,
1418
1472
  type: "execution-metrics",
1419
1473
  ...validThresholds,
1420
1474
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1421
1475
  ...required2 !== void 0 ? { required: required2 } : {},
1476
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1422
1477
  ...negate !== void 0 ? { negate } : {}
1423
1478
  });
1424
1479
  continue;
@@ -1432,7 +1487,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1432
1487
  const rawShouldTrigger = rawEvaluator.should_trigger;
1433
1488
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
1434
1489
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1435
- const required2 = parseRequired(rawEvaluator.required);
1490
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1491
+ rawEvaluator.required,
1492
+ rawEvaluator.min_score,
1493
+ name,
1494
+ evalId
1495
+ );
1436
1496
  evaluators.push({
1437
1497
  name,
1438
1498
  type: "skill-trigger",
@@ -1440,6 +1500,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1440
1500
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
1441
1501
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1442
1502
  ...required2 !== void 0 ? { required: required2 } : {},
1503
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1443
1504
  ...negate !== void 0 ? { negate } : {}
1444
1505
  });
1445
1506
  continue;
@@ -1451,13 +1512,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1451
1512
  continue;
1452
1513
  }
1453
1514
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1454
- const required2 = parseRequired(rawEvaluator.required);
1515
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1516
+ rawEvaluator.required,
1517
+ rawEvaluator.min_score,
1518
+ name,
1519
+ evalId
1520
+ );
1455
1521
  evaluators.push({
1456
1522
  name,
1457
1523
  type: "contains",
1458
1524
  value,
1459
1525
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1460
1526
  ...required2 !== void 0 ? { required: required2 } : {},
1527
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1461
1528
  ...negate !== void 0 ? { negate } : {}
1462
1529
  });
1463
1530
  continue;
@@ -1471,13 +1538,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1471
1538
  continue;
1472
1539
  }
1473
1540
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1474
- const required2 = parseRequired(rawEvaluator.required);
1541
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1542
+ rawEvaluator.required,
1543
+ rawEvaluator.min_score,
1544
+ name,
1545
+ evalId
1546
+ );
1475
1547
  evaluators.push({
1476
1548
  name,
1477
1549
  type: typeValue,
1478
1550
  value,
1479
1551
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1480
1552
  ...required2 !== void 0 ? { required: required2 } : {},
1553
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1481
1554
  ...negate !== void 0 ? { negate } : {}
1482
1555
  });
1483
1556
  continue;
@@ -1489,13 +1562,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1489
1562
  continue;
1490
1563
  }
1491
1564
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1492
- const required2 = parseRequired(rawEvaluator.required);
1565
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1566
+ rawEvaluator.required,
1567
+ rawEvaluator.min_score,
1568
+ name,
1569
+ evalId
1570
+ );
1493
1571
  evaluators.push({
1494
1572
  name,
1495
1573
  type: "icontains",
1496
1574
  value,
1497
1575
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1498
1576
  ...required2 !== void 0 ? { required: required2 } : {},
1577
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1499
1578
  ...negate !== void 0 ? { negate } : {}
1500
1579
  });
1501
1580
  continue;
@@ -1509,13 +1588,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1509
1588
  continue;
1510
1589
  }
1511
1590
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1512
- const required2 = parseRequired(rawEvaluator.required);
1591
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1592
+ rawEvaluator.required,
1593
+ rawEvaluator.min_score,
1594
+ name,
1595
+ evalId
1596
+ );
1513
1597
  evaluators.push({
1514
1598
  name,
1515
1599
  type: typeValue,
1516
1600
  value,
1517
1601
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1518
1602
  ...required2 !== void 0 ? { required: required2 } : {},
1603
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1519
1604
  ...negate !== void 0 ? { negate } : {}
1520
1605
  });
1521
1606
  continue;
@@ -1527,13 +1612,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1527
1612
  continue;
1528
1613
  }
1529
1614
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1530
- const required2 = parseRequired(rawEvaluator.required);
1615
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1616
+ rawEvaluator.required,
1617
+ rawEvaluator.min_score,
1618
+ name,
1619
+ evalId
1620
+ );
1531
1621
  evaluators.push({
1532
1622
  name,
1533
1623
  type: typeValue,
1534
1624
  value,
1535
1625
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1536
1626
  ...required2 !== void 0 ? { required: required2 } : {},
1627
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1537
1628
  ...negate !== void 0 ? { negate } : {}
1538
1629
  });
1539
1630
  continue;
@@ -1546,7 +1637,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1546
1637
  }
1547
1638
  const flags = asString(rawEvaluator.flags);
1548
1639
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1549
- const required2 = parseRequired(rawEvaluator.required);
1640
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1641
+ rawEvaluator.required,
1642
+ rawEvaluator.min_score,
1643
+ name,
1644
+ evalId
1645
+ );
1550
1646
  evaluators.push({
1551
1647
  name,
1552
1648
  type: "regex",
@@ -1554,18 +1650,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1554
1650
  ...flags !== void 0 ? { flags } : {},
1555
1651
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1556
1652
  ...required2 !== void 0 ? { required: required2 } : {},
1653
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1557
1654
  ...negate !== void 0 ? { negate } : {}
1558
1655
  });
1559
1656
  continue;
1560
1657
  }
1561
1658
  if (typeValue === "is-json") {
1562
1659
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1563
- const required2 = parseRequired(rawEvaluator.required);
1660
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1661
+ rawEvaluator.required,
1662
+ rawEvaluator.min_score,
1663
+ name,
1664
+ evalId
1665
+ );
1564
1666
  evaluators.push({
1565
1667
  name,
1566
1668
  type: "is-json",
1567
1669
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1568
1670
  ...required2 !== void 0 ? { required: required2 } : {},
1671
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1569
1672
  ...negate !== void 0 ? { negate } : {}
1570
1673
  });
1571
1674
  continue;
@@ -1577,13 +1680,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1577
1680
  continue;
1578
1681
  }
1579
1682
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1580
- const required2 = parseRequired(rawEvaluator.required);
1683
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1684
+ rawEvaluator.required,
1685
+ rawEvaluator.min_score,
1686
+ name,
1687
+ evalId
1688
+ );
1581
1689
  evaluators.push({
1582
1690
  name,
1583
1691
  type: "equals",
1584
1692
  value,
1585
1693
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1586
1694
  ...required2 !== void 0 ? { required: required2 } : {},
1695
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1587
1696
  ...negate !== void 0 ? { negate } : {}
1588
1697
  });
1589
1698
  continue;
@@ -1619,7 +1728,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1619
1728
  continue;
1620
1729
  }
1621
1730
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1622
- const required2 = parseRequired(rawEvaluator.required);
1731
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1732
+ rawEvaluator.required,
1733
+ rawEvaluator.min_score,
1734
+ name,
1735
+ evalId
1736
+ );
1623
1737
  evaluators.push({
1624
1738
  name,
1625
1739
  type: "llm-grader",
@@ -1627,6 +1741,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1627
1741
  ...graderTargetName ? { target: graderTargetName } : {},
1628
1742
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1629
1743
  ...required2 !== void 0 ? { required: required2 } : {},
1744
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1630
1745
  ...negate !== void 0 ? { negate } : {}
1631
1746
  });
1632
1747
  continue;
@@ -1696,7 +1811,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1696
1811
  continue;
1697
1812
  }
1698
1813
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1699
- const required2 = parseRequired(rawEvaluator.required);
1814
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1815
+ rawEvaluator.required,
1816
+ rawEvaluator.min_score,
1817
+ name,
1818
+ evalId
1819
+ );
1700
1820
  evaluators.push({
1701
1821
  name,
1702
1822
  type: "llm-grader",
@@ -1704,12 +1824,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1704
1824
  ...graderTargetName ? { target: graderTargetName } : {},
1705
1825
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1706
1826
  ...required2 !== void 0 ? { required: required2 } : {},
1827
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1707
1828
  ...negate !== void 0 ? { negate } : {}
1708
1829
  });
1709
1830
  continue;
1710
1831
  }
1711
1832
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
1712
- const required = parseRequired(rawEvaluator.required);
1833
+ const { required, min_score } = parseRequiredAndMinScore(
1834
+ rawEvaluator.required,
1835
+ rawEvaluator.min_score,
1836
+ name,
1837
+ evalId
1838
+ );
1713
1839
  const knownProps = /* @__PURE__ */ new Set([
1714
1840
  "name",
1715
1841
  "type",
@@ -1720,6 +1846,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1720
1846
  "weight",
1721
1847
  "config",
1722
1848
  "required",
1849
+ "min_score",
1723
1850
  "negate",
1724
1851
  "max_steps",
1725
1852
  "maxSteps",
@@ -1749,6 +1876,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1749
1876
  ...graderTargetName ? { target: graderTargetName } : {},
1750
1877
  ...weight !== void 0 ? { weight } : {},
1751
1878
  ...required !== void 0 ? { required } : {},
1879
+ ...min_score !== void 0 ? { min_score } : {},
1752
1880
  ...negate !== void 0 ? { negate } : {},
1753
1881
  ...finalConfig ? { config: finalConfig } : {},
1754
1882
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -1880,10 +2008,23 @@ ${detailBlock}${ANSI_RESET4}`);
1880
2008
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
1881
2009
  }
1882
2010
  }
1883
- function parseRequired(value) {
1884
- if (value === true) return true;
1885
- if (typeof value === "number" && value > 0 && value <= 1) return value;
1886
- return void 0;
2011
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
2012
+ const result = {};
2013
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
2014
+ result.min_score = rawMinScore;
2015
+ }
2016
+ if (rawRequired === true) {
2017
+ result.required = true;
2018
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
2019
+ if (result.min_score === void 0) {
2020
+ result.min_score = rawRequired;
2021
+ }
2022
+ result.required = rawRequired;
2023
+ logWarning2(
2024
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
2025
+ );
2026
+ }
2027
+ return result;
1887
2028
  }
1888
2029
  function validateWeight(rawWeight, evaluatorName, evalId) {
1889
2030
  if (rawWeight === void 0) {
@@ -1926,16 +2067,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1926
2067
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
1927
2068
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
1928
2069
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
2070
+ let minScore;
1929
2071
  let requiredMinScore;
1930
2072
  let required;
1931
- if (typeof rawRubric.required_min_score === "number") {
1932
- const minScore = rawRubric.required_min_score;
1933
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
2073
+ if (typeof rawRubric.min_score === "number") {
2074
+ const ms = rawRubric.min_score;
2075
+ if (ms <= 0 || ms > 1) {
1934
2076
  throw new Error(
1935
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
2077
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
1936
2078
  );
1937
2079
  }
1938
- requiredMinScore = minScore;
2080
+ minScore = ms;
2081
+ requiredMinScore = Math.round(ms * 10);
2082
+ } else if (typeof rawRubric.required_min_score === "number") {
2083
+ const rms = rawRubric.required_min_score;
2084
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
2085
+ throw new Error(
2086
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
2087
+ );
2088
+ }
2089
+ requiredMinScore = rms;
2090
+ minScore = rms / 10;
2091
+ logWarning2(
2092
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
2093
+ );
1939
2094
  }
1940
2095
  if (typeof rawRubric.required === "boolean") {
1941
2096
  required = rawRubric.required;
@@ -1955,6 +2110,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1955
2110
  weight,
1956
2111
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
1957
2112
  ...required !== void 0 ? { required } : {},
2113
+ ...minScore !== void 0 ? { min_score: minScore } : {},
1958
2114
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
1959
2115
  score_ranges: scoreRanges
1960
2116
  });
@@ -1971,6 +2127,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1971
2127
  weight,
1972
2128
  // Default to required: true if not specified (backward compatibility)
1973
2129
  required: required ?? true,
2130
+ ...minScore !== void 0 ? { min_score: minScore } : {},
1974
2131
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
1975
2132
  });
1976
2133
  }
@@ -2099,12 +2256,22 @@ function parseInlineRubrics(rawRubrics) {
2099
2256
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
2100
2257
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
2101
2258
  };
2259
+ let inlineMinScore;
2260
+ let inlineRequiredMinScore;
2261
+ if (typeof rubric.min_score === "number") {
2262
+ inlineMinScore = rubric.min_score;
2263
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
2264
+ } else if (typeof rubric.required_min_score === "number") {
2265
+ inlineRequiredMinScore = rubric.required_min_score;
2266
+ inlineMinScore = inlineRequiredMinScore / 10;
2267
+ }
2102
2268
  if (scoreRanges && scoreRanges.length > 0) {
2103
2269
  return {
2104
2270
  ...baseRubric,
2105
2271
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
2106
2272
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
2107
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
2273
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
2274
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
2108
2275
  score_ranges: scoreRanges
2109
2276
  };
2110
2277
  }
@@ -2112,7 +2279,8 @@ function parseInlineRubrics(rawRubrics) {
2112
2279
  ...baseRubric,
2113
2280
  outcome: expectedOutcome,
2114
2281
  required: typeof rubric.required === "boolean" ? rubric.required : true,
2115
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
2282
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
2283
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
2116
2284
  };
2117
2285
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
2118
2286
  if (rubricItems.length === 0) {
@@ -2512,6 +2680,9 @@ function resolveExpectedMessages(raw) {
2512
2680
  var ANSI_YELLOW5 = "\x1B[33m";
2513
2681
  var ANSI_RED2 = "\x1B[31m";
2514
2682
  var ANSI_RESET6 = "\x1B[0m";
2683
+ function matchesFilter(id, filter) {
2684
+ return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
2685
+ }
2515
2686
  function detectFormat(filePath) {
2516
2687
  const ext = path6.extname(filePath).toLowerCase();
2517
2688
  if (ext === ".jsonl") return "jsonl";
@@ -2579,40 +2750,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2579
2750
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
2580
2751
  const rawFile = await readFile5(absoluteTestPath, "utf8");
2581
2752
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
2582
- const fallbackEvalSet = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2583
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
2753
+ const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2754
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
2584
2755
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
2585
2756
  const globalExecution = sidecar.execution;
2586
2757
  if (verbose) {
2587
2758
  console.log(`
2588
- [JSONL Dataset: ${evalFilePath}]`);
2759
+ [JSONL Suite: ${evalFilePath}]`);
2589
2760
  console.log(` Cases: ${rawCases.length}`);
2590
- console.log(` Eval set: ${evalSetName}`);
2761
+ console.log(` Suite: ${suiteName}`);
2591
2762
  if (sidecar.description) {
2592
2763
  console.log(` Description: ${sidecar.description}`);
2593
2764
  }
2594
2765
  }
2595
2766
  const results = [];
2596
2767
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
2597
- const evalcase = rawCases[lineIndex];
2768
+ const testCaseConfig = rawCases[lineIndex];
2598
2769
  const lineNumber = lineIndex + 1;
2599
- const id = asString4(evalcase.id);
2600
- if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
2770
+ const id = asString4(testCaseConfig.id);
2771
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
2601
2772
  continue;
2602
2773
  }
2603
- const conversationId = asString4(evalcase.conversation_id);
2604
- let outcome = asString4(evalcase.criteria);
2605
- if (!outcome && evalcase.expected_outcome !== void 0) {
2606
- outcome = asString4(evalcase.expected_outcome);
2774
+ const conversationId = asString4(testCaseConfig.conversation_id);
2775
+ let outcome = asString4(testCaseConfig.criteria);
2776
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
2777
+ outcome = asString4(testCaseConfig.expected_outcome);
2607
2778
  if (outcome) {
2608
2779
  logWarning4(
2609
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2780
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2610
2781
  );
2611
2782
  }
2612
2783
  }
2613
- const rawInputMessages = resolveInputMessages(evalcase);
2614
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2615
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
2784
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
2785
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
2786
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
2616
2787
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
2617
2788
  logError2(
2618
2789
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -2649,18 +2820,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2649
2820
  }
2650
2821
  }
2651
2822
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
2652
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
2823
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
2653
2824
  const mergedExecution = caseExecution ?? globalExecution;
2654
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
2825
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
2655
2826
  let evaluators;
2656
2827
  try {
2657
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
2828
+ evaluators = await parseEvaluators(
2829
+ testCaseConfig,
2830
+ mergedExecution,
2831
+ searchRoots,
2832
+ id ?? "unknown"
2833
+ );
2658
2834
  } catch (error) {
2659
2835
  const message = error instanceof Error ? error.message : String(error);
2660
2836
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
2661
2837
  continue;
2662
2838
  }
2663
- const inlineRubrics = evalcase.rubrics;
2839
+ const inlineRubrics = testCaseConfig.rubrics;
2664
2840
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
2665
2841
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
2666
2842
  if (rubricEvaluator) {
@@ -2671,7 +2847,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2671
2847
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
2672
2848
  const testCase = {
2673
2849
  id,
2674
- dataset: evalSetName,
2850
+ suite: suiteName,
2675
2851
  conversation_id: conversationId,
2676
2852
  question,
2677
2853
  input: inputMessages,
@@ -2679,7 +2855,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2679
2855
  reference_answer: referenceAnswer,
2680
2856
  file_paths: userFilePaths,
2681
2857
  criteria: outcome ?? "",
2682
- evaluator: evalCaseEvaluatorKind,
2858
+ evaluator: testCaseEvaluatorKind,
2683
2859
  assertions: evaluators
2684
2860
  };
2685
2861
  results.push(testCase);
@@ -2862,6 +3038,9 @@ function buildChatPromptFromSegments(options) {
2862
3038
  var ANSI_YELLOW6 = "\x1B[33m";
2863
3039
  var ANSI_RED3 = "\x1B[31m";
2864
3040
  var ANSI_RESET7 = "\x1B[0m";
3041
+ function matchesFilter2(id, filter) {
3042
+ return typeof filter === "string" ? micromatch2.isMatch(id, filter) : filter.some((pattern) => micromatch2.isMatch(id, pattern));
3043
+ }
2865
3044
  function resolveTests(suite) {
2866
3045
  if (suite.tests !== void 0) return suite.tests;
2867
3046
  if (suite.eval_cases !== void 0) {
@@ -2941,18 +3120,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2941
3120
  throw new Error(`Invalid test file format: ${evalFilePath}`);
2942
3121
  }
2943
3122
  const suite = interpolated;
2944
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
2945
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
2946
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
2947
- const rawTestcases = resolveTests(suite);
3123
+ const suiteNameFromFile = asString5(suite.name)?.trim();
3124
+ const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
3125
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
3126
+ const rawTestCases = resolveTests(suite);
2948
3127
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
2949
3128
  const evalFileDir = path7.dirname(absoluteTestPath);
2950
- let expandedTestcases;
2951
- if (typeof rawTestcases === "string") {
2952
- const externalPath = path7.resolve(evalFileDir, rawTestcases);
2953
- expandedTestcases = await loadCasesFromFile(externalPath);
2954
- } else if (Array.isArray(rawTestcases)) {
2955
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
3129
+ let expandedTestCases;
3130
+ if (typeof rawTestCases === "string") {
3131
+ const externalPath = path7.resolve(evalFileDir, rawTestCases);
3132
+ expandedTestCases = await loadCasesFromFile(externalPath);
3133
+ } else if (Array.isArray(rawTestCases)) {
3134
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
2956
3135
  } else {
2957
3136
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
2958
3137
  }
@@ -2967,32 +3146,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2967
3146
  }
2968
3147
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
2969
3148
  const results = [];
2970
- for (const rawEvalcase of expandedTestcases) {
2971
- if (!isJsonObject(rawEvalcase)) {
3149
+ for (const rawTestCase of expandedTestCases) {
3150
+ if (!isJsonObject(rawTestCase)) {
2972
3151
  logWarning5("Skipping invalid test entry (expected object)");
2973
3152
  continue;
2974
3153
  }
2975
- const evalcase = rawEvalcase;
2976
- const id = asString5(evalcase.id);
2977
- if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
3154
+ const testCaseConfig = rawTestCase;
3155
+ const id = asString5(testCaseConfig.id);
3156
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
2978
3157
  continue;
2979
3158
  }
2980
- const conversationId = asString5(evalcase.conversation_id);
2981
- let outcome = asString5(evalcase.criteria);
2982
- if (!outcome && evalcase.expected_outcome !== void 0) {
2983
- outcome = asString5(evalcase.expected_outcome);
3159
+ const conversationId = asString5(testCaseConfig.conversation_id);
3160
+ let outcome = asString5(testCaseConfig.criteria);
3161
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
3162
+ outcome = asString5(testCaseConfig.expected_outcome);
2984
3163
  if (outcome) {
2985
3164
  logWarning5(
2986
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
3165
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2987
3166
  );
2988
3167
  }
2989
3168
  }
2990
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
3169
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
2991
3170
  const skipDefaults = caseExecution?.skip_defaults === true;
3171
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
2992
3172
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
2993
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
2994
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2995
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
3173
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
3174
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
3175
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
2996
3176
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
2997
3177
  logError3(
2998
3178
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -3039,16 +3219,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3039
3219
  }
3040
3220
  }
3041
3221
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
3042
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
3222
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
3043
3223
  let evaluators;
3044
3224
  try {
3045
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
3225
+ evaluators = await parseEvaluators(
3226
+ testCaseConfig,
3227
+ globalExecution,
3228
+ searchRoots,
3229
+ id ?? "unknown"
3230
+ );
3046
3231
  } catch (error) {
3047
3232
  const message = error instanceof Error ? error.message : String(error);
3048
3233
  logError3(`Skipping test '${id}': ${message}`);
3049
3234
  continue;
3050
3235
  }
3051
- const inlineRubrics = evalcase.rubrics;
3236
+ const inlineRubrics = testCaseConfig.rubrics;
3052
3237
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
3053
3238
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
3054
3239
  if (rubricEvaluator) {
@@ -3057,13 +3242,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3057
3242
  }
3058
3243
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
3059
3244
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
3060
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
3245
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
3061
3246
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
3062
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
3063
- const caseTargets = extractTargetsFromTestCase(evalcase);
3247
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
3248
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
3064
3249
  const testCase = {
3065
3250
  id,
3066
- dataset: evalSetName,
3251
+ suite: suiteName,
3067
3252
  category: options?.category,
3068
3253
  conversation_id: conversationId,
3069
3254
  question,
@@ -3072,11 +3257,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3072
3257
  reference_answer: referenceAnswer,
3073
3258
  file_paths: userFilePaths,
3074
3259
  criteria: outcome ?? "",
3075
- evaluator: evalCaseEvaluatorKind,
3260
+ evaluator: testCaseEvaluatorKind,
3076
3261
  assertions: evaluators,
3077
3262
  workspace: mergedWorkspace,
3078
3263
  metadata,
3079
- targets: caseTargets
3264
+ targets: caseTargets,
3265
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
3080
3266
  };
3081
3267
  results.push(testCase);
3082
3268
  }
@@ -3620,7 +3806,7 @@ var AzureProvider = class {
3620
3806
  };
3621
3807
  this.retryConfig = config.retry;
3622
3808
  const azure = createAzure(buildAzureOptions(config));
3623
- this.model = azure.chat(config.deploymentName);
3809
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
3624
3810
  }
3625
3811
  id;
3626
3812
  kind = "azure";
@@ -3746,7 +3932,9 @@ function buildAzureOptions(config) {
3746
3932
  const options = {
3747
3933
  apiKey: config.apiKey,
3748
3934
  apiVersion: config.version,
3749
- useDeploymentBasedUrls: true
3935
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
3936
+ // with existing deployments. Responses API should use the SDK's v1 path.
3937
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
3750
3938
  };
3751
3939
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
3752
3940
  if (baseURL) {
@@ -7354,6 +7542,22 @@ function extractAzureResourceName(baseUrl) {
7354
7542
  if (urlMatch) return urlMatch[1];
7355
7543
  return baseUrl;
7356
7544
  }
7545
+ function normalizeAzureSdkBaseUrl(baseUrl) {
7546
+ const trimmed = baseUrl.trim().replace(/\/+$/, "");
7547
+ if (!trimmed) {
7548
+ return trimmed;
7549
+ }
7550
+ if (!/^https?:\/\//i.test(trimmed)) {
7551
+ return `https://${trimmed}.openai.azure.com/openai/v1`;
7552
+ }
7553
+ if (/\/openai\/v1$/i.test(trimmed)) {
7554
+ return trimmed;
7555
+ }
7556
+ if (/\/openai$/i.test(trimmed)) {
7557
+ return `${trimmed}/v1`;
7558
+ }
7559
+ return `${trimmed}/openai/v1`;
7560
+ }
7357
7561
 
7358
7562
  // src/evaluation/providers/pi-utils.ts
7359
7563
  function extractPiTextContent(content) {
@@ -8184,11 +8388,41 @@ async function defaultPiRunner(options) {
8184
8388
  // src/evaluation/providers/pi-coding-agent.ts
8185
8389
  import { execSync as execSync2 } from "node:child_process";
8186
8390
  import { randomUUID as randomUUID8 } from "node:crypto";
8187
- import { accessSync as accessSync2, createWriteStream as createWriteStream6 } from "node:fs";
8391
+ import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
8188
8392
  import { mkdir as mkdir7 } from "node:fs/promises";
8189
- import path20 from "node:path";
8393
+ import path21 from "node:path";
8190
8394
  import { createInterface } from "node:readline";
8191
- import { fileURLToPath as fileURLToPath3 } from "node:url";
8395
+ import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
8396
+
8397
+ // src/paths.ts
8398
+ import os2 from "node:os";
8399
+ import path20 from "node:path";
8400
+ var logged = false;
8401
+ function getAgentvHome() {
8402
+ const envHome = process.env.AGENTV_HOME;
8403
+ if (envHome && envHome !== "undefined") {
8404
+ if (!logged) {
8405
+ logged = true;
8406
+ console.warn(`Using AGENTV_HOME: ${envHome}`);
8407
+ }
8408
+ return envHome;
8409
+ }
8410
+ return path20.join(os2.homedir(), ".agentv");
8411
+ }
8412
+ function getWorkspacesRoot() {
8413
+ return path20.join(getAgentvHome(), "workspaces");
8414
+ }
8415
+ function getSubagentsRoot() {
8416
+ return path20.join(getAgentvHome(), "subagents");
8417
+ }
8418
+ function getTraceStateRoot() {
8419
+ return path20.join(getAgentvHome(), "trace-state");
8420
+ }
8421
+ function getWorkspacePoolRoot() {
8422
+ return path20.join(getAgentvHome(), "workspace-pool");
8423
+ }
8424
+
8425
+ // src/evaluation/providers/pi-coding-agent.ts
8192
8426
  var piCodingAgentModule = null;
8193
8427
  var piAiModule = null;
8194
8428
  var loadingPromise = null;
@@ -8206,46 +8440,126 @@ async function promptInstall() {
8206
8440
  rl.close();
8207
8441
  }
8208
8442
  }
8209
- function findAgentvRoot() {
8210
- const thisFile = fileURLToPath3(import.meta.url);
8211
- let dir = path20.dirname(thisFile);
8212
- for (let i = 0; i < 10; i++) {
8443
+ function findManagedSdkInstallRoot() {
8444
+ return path21.join(getAgentvHome(), "deps", "pi-sdk");
8445
+ }
8446
+ function resolveGlobalNpmRoot() {
8447
+ try {
8448
+ const root = execSync2("npm root -g", {
8449
+ encoding: "utf-8",
8450
+ stdio: ["ignore", "pipe", "ignore"]
8451
+ }).trim();
8452
+ return root.length > 0 ? root : void 0;
8453
+ } catch {
8454
+ return void 0;
8455
+ }
8456
+ }
8457
+ function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
8458
+ return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
8459
+ }
8460
+ function findAccessiblePath(paths) {
8461
+ for (const candidate of paths) {
8213
8462
  try {
8214
- const pkg = path20.join(dir, "package.json");
8215
- accessSync2(pkg);
8216
- return dir;
8463
+ accessSync2(candidate);
8464
+ return candidate;
8217
8465
  } catch {
8218
- const parent = path20.dirname(dir);
8219
- if (parent === dir) break;
8220
- dir = parent;
8221
8466
  }
8222
8467
  }
8223
- return path20.dirname(thisFile);
8468
+ return void 0;
8224
8469
  }
8225
- async function doLoadSdkModules() {
8470
+ async function tryImportLocalSdkModules() {
8226
8471
  try {
8227
8472
  [piCodingAgentModule, piAiModule] = await Promise.all([
8228
8473
  import("@mariozechner/pi-coding-agent"),
8229
8474
  import("@mariozechner/pi-ai")
8230
8475
  ]);
8476
+ return true;
8231
8477
  } catch {
8232
- if (await promptInstall()) {
8233
- const installDir = findAgentvRoot();
8234
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
8235
- execSync2("bun add @mariozechner/pi-coding-agent", {
8236
- cwd: installDir,
8237
- stdio: "inherit"
8238
- });
8239
- [piCodingAgentModule, piAiModule] = await Promise.all([
8240
- import("@mariozechner/pi-coding-agent"),
8241
- import("@mariozechner/pi-ai")
8242
- ]);
8243
- } else {
8244
- throw new Error(
8245
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
8246
- );
8478
+ return false;
8479
+ }
8480
+ }
8481
+ async function tryImportManagedSdkModules() {
8482
+ const managedRoot = findManagedSdkInstallRoot();
8483
+ const piCodingAgentEntry = findAccessiblePath([
8484
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
8485
+ ]);
8486
+ const piAiEntry = findAccessiblePath([
8487
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
8488
+ path21.join(
8489
+ managedRoot,
8490
+ "node_modules",
8491
+ "@mariozechner",
8492
+ "pi-coding-agent",
8493
+ "node_modules",
8494
+ "@mariozechner",
8495
+ "pi-ai",
8496
+ "dist",
8497
+ "index.js"
8498
+ )
8499
+ ]);
8500
+ if (!piCodingAgentEntry || !piAiEntry) return false;
8501
+ try {
8502
+ [piCodingAgentModule, piAiModule] = await Promise.all([
8503
+ import(pathToFileURL(piCodingAgentEntry).href),
8504
+ import(pathToFileURL(piAiEntry).href)
8505
+ ]);
8506
+ return true;
8507
+ } catch {
8508
+ return false;
8509
+ }
8510
+ }
8511
+ async function tryImportGlobalSdkModules() {
8512
+ const globalNpmRoot = resolveGlobalNpmRoot();
8513
+ if (!globalNpmRoot) return false;
8514
+ const piCodingAgentEntry = findAccessiblePath([
8515
+ buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
8516
+ ]);
8517
+ const piAiEntry = findAccessiblePath([
8518
+ buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
8519
+ path21.join(
8520
+ globalNpmRoot,
8521
+ "@mariozechner",
8522
+ "pi-coding-agent",
8523
+ "node_modules",
8524
+ "@mariozechner",
8525
+ "pi-ai",
8526
+ "dist",
8527
+ "index.js"
8528
+ )
8529
+ ]);
8530
+ if (!piCodingAgentEntry || !piAiEntry) return false;
8531
+ try {
8532
+ [piCodingAgentModule, piAiModule] = await Promise.all([
8533
+ import(pathToFileURL(piCodingAgentEntry).href),
8534
+ import(pathToFileURL(piAiEntry).href)
8535
+ ]);
8536
+ return true;
8537
+ } catch {
8538
+ return false;
8539
+ }
8540
+ }
8541
+ function installSdkModules(installDir) {
8542
+ console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
8543
+ mkdirSync(installDir, { recursive: true });
8544
+ execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
8545
+ cwd: installDir,
8546
+ stdio: "inherit"
8547
+ });
8548
+ }
8549
+ async function doLoadSdkModules() {
8550
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
8551
+ return;
8552
+ }
8553
+ if (await promptInstall()) {
8554
+ const installDir = findManagedSdkInstallRoot();
8555
+ installSdkModules(installDir);
8556
+ if (await tryImportManagedSdkModules()) {
8557
+ return;
8247
8558
  }
8248
8559
  }
8560
+ throw new Error(
8561
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
8562
+ );
8249
8563
  }
8250
8564
  async function loadSdkModules() {
8251
8565
  if (!piCodingAgentModule || !piAiModule) {
@@ -8302,12 +8616,16 @@ var PiCodingAgentProvider = class {
8302
8616
  try {
8303
8617
  const cwd = this.resolveCwd(request.cwd);
8304
8618
  const rawProvider = this.config.subprovider ?? "google";
8305
- const hasBaseUrl = !!this.config.baseUrl;
8619
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
8620
+ const hasBaseUrl = !!normalizedBaseUrl;
8306
8621
  const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
8307
8622
  const modelId = this.config.model ?? "gemini-2.5-flash";
8308
8623
  this.setApiKeyEnv(rawProvider, hasBaseUrl);
8309
- this.setBaseUrlEnv(rawProvider, hasBaseUrl);
8624
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
8310
8625
  let model = sdk.getModel(providerName, modelId);
8626
+ if (model && normalizedBaseUrl) {
8627
+ model = { ...model, baseUrl: normalizedBaseUrl };
8628
+ }
8311
8629
  if (!model) {
8312
8630
  const envProvider = providerName.replace(/-responses$/, "");
8313
8631
  model = {
@@ -8315,7 +8633,7 @@ var PiCodingAgentProvider = class {
8315
8633
  name: modelId,
8316
8634
  api: providerName,
8317
8635
  provider: envProvider,
8318
- baseUrl: this.config.baseUrl ?? "",
8636
+ baseUrl: normalizedBaseUrl ?? "",
8319
8637
  reasoning: false,
8320
8638
  input: ["text"],
8321
8639
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
@@ -8482,19 +8800,27 @@ ${fileList}`;
8482
8800
  }
8483
8801
  }
8484
8802
  /** Maps config baseUrl to the provider-specific env var the SDK reads. */
8485
- setBaseUrlEnv(providerName, hasBaseUrl = false) {
8486
- if (!this.config.baseUrl) return;
8803
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
8804
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
8805
+ if (!normalizedBaseUrl) return;
8487
8806
  const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
8488
8807
  if (envKey) {
8489
- process.env[envKey] = this.config.baseUrl;
8808
+ process.env[envKey] = normalizedBaseUrl;
8490
8809
  }
8491
8810
  }
8811
+ normalizeSdkBaseUrl(providerName, baseUrl) {
8812
+ if (!baseUrl) return void 0;
8813
+ if (providerName.toLowerCase() === "azure") {
8814
+ return normalizeAzureSdkBaseUrl(baseUrl);
8815
+ }
8816
+ return baseUrl;
8817
+ }
8492
8818
  resolveCwd(cwdOverride) {
8493
8819
  if (cwdOverride) {
8494
- return path20.resolve(cwdOverride);
8820
+ return path21.resolve(cwdOverride);
8495
8821
  }
8496
8822
  if (this.config.cwd) {
8497
- return path20.resolve(this.config.cwd);
8823
+ return path21.resolve(this.config.cwd);
8498
8824
  }
8499
8825
  return process.cwd();
8500
8826
  }
@@ -8513,9 +8839,9 @@ ${fileList}`;
8513
8839
  }
8514
8840
  resolveLogDirectory() {
8515
8841
  if (this.config.logDir) {
8516
- return path20.resolve(this.config.logDir);
8842
+ return path21.resolve(this.config.logDir);
8517
8843
  }
8518
- return path20.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
8844
+ return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
8519
8845
  }
8520
8846
  async createStreamLogger(request) {
8521
8847
  const logDir = this.resolveLogDirectory();
@@ -8529,7 +8855,7 @@ ${fileList}`;
8529
8855
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
8530
8856
  return void 0;
8531
8857
  }
8532
- const filePath = path20.join(logDir, buildLogFilename6(request, this.targetName));
8858
+ const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
8533
8859
  try {
8534
8860
  const logger = await PiStreamLogger2.create({
8535
8861
  filePath,
@@ -8754,7 +9080,7 @@ import path30 from "node:path";
8754
9080
  // src/evaluation/providers/vscode/utils/fs.ts
8755
9081
  import { constants as constants2 } from "node:fs";
8756
9082
  import { access as access2, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
8757
- import path21 from "node:path";
9083
+ import path22 from "node:path";
8758
9084
  async function pathExists(target) {
8759
9085
  try {
8760
9086
  await access2(target, constants2.F_OK);
@@ -8770,7 +9096,7 @@ async function readDirEntries(target) {
8770
9096
  const entries = await readdir2(target, { withFileTypes: true });
8771
9097
  return entries.map((entry) => ({
8772
9098
  name: entry.name,
8773
- absolutePath: path21.join(target, entry.name),
9099
+ absolutePath: path22.join(target, entry.name),
8774
9100
  isDirectory: entry.isDirectory()
8775
9101
  }));
8776
9102
  }
@@ -8785,9 +9111,9 @@ async function removeIfExists(target) {
8785
9111
  }
8786
9112
 
8787
9113
  // src/evaluation/providers/vscode/utils/path.ts
8788
- import path22 from "node:path";
9114
+ import path23 from "node:path";
8789
9115
  function pathToFileUri2(filePath) {
8790
- const absolutePath = path22.isAbsolute(filePath) ? filePath : path22.resolve(filePath);
9116
+ const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
8791
9117
  const normalizedPath = absolutePath.replace(/\\/g, "/");
8792
9118
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
8793
9119
  return `file:///${normalizedPath}`;
@@ -8796,7 +9122,7 @@ function pathToFileUri2(filePath) {
8796
9122
  }
8797
9123
 
8798
9124
  // src/evaluation/providers/vscode/dispatch/promptBuilder.ts
8799
- import path23 from "node:path";
9125
+ import path24 from "node:path";
8800
9126
 
8801
9127
  // src/evaluation/providers/vscode/utils/template.ts
8802
9128
  function renderTemplate2(content, variables) {
@@ -8888,8 +9214,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
8888
9214
  });
8889
9215
  }
8890
9216
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
8891
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path23.basename(file)}`).join("\n");
8892
- const responseList = responseFiles.map((file) => `"${path23.basename(file)}"`).join(", ");
9217
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
9218
+ const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
8893
9219
  return renderTemplate2(templateContent, {
8894
9220
  requestFiles: requestLines,
8895
9221
  responseList
@@ -8898,7 +9224,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
8898
9224
 
8899
9225
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
8900
9226
  import { readFile as readFile9 } from "node:fs/promises";
8901
- import path24 from "node:path";
9227
+ import path25 from "node:path";
8902
9228
 
8903
9229
  // src/evaluation/providers/vscode/utils/time.ts
8904
9230
  function sleep2(ms) {
@@ -8957,7 +9283,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
8957
9283
  }
8958
9284
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
8959
9285
  if (!silent) {
8960
- const fileList = responseFilesFinal.map((file) => path24.basename(file)).join(", ");
9286
+ const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
8961
9287
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
8962
9288
  }
8963
9289
  const deadline = Date.now() + timeoutMs;
@@ -8966,7 +9292,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
8966
9292
  while (pending.size > 0) {
8967
9293
  if (Date.now() >= deadline) {
8968
9294
  if (!silent) {
8969
- const remaining = [...pending].map((f) => path24.basename(f)).join(", ");
9295
+ const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
8970
9296
  console.error(
8971
9297
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
8972
9298
  );
@@ -9022,36 +9348,6 @@ import { promisify as promisify2 } from "node:util";
9022
9348
 
9023
9349
  // src/evaluation/providers/vscode/dispatch/constants.ts
9024
9350
  import path26 from "node:path";
9025
-
9026
- // src/paths.ts
9027
- import os2 from "node:os";
9028
- import path25 from "node:path";
9029
- var logged = false;
9030
- function getAgentvHome() {
9031
- const envHome = process.env.AGENTV_HOME;
9032
- if (envHome && envHome !== "undefined") {
9033
- if (!logged) {
9034
- logged = true;
9035
- console.warn(`Using AGENTV_HOME: ${envHome}`);
9036
- }
9037
- return envHome;
9038
- }
9039
- return path25.join(os2.homedir(), ".agentv");
9040
- }
9041
- function getWorkspacesRoot() {
9042
- return path25.join(getAgentvHome(), "workspaces");
9043
- }
9044
- function getSubagentsRoot() {
9045
- return path25.join(getAgentvHome(), "subagents");
9046
- }
9047
- function getTraceStateRoot() {
9048
- return path25.join(getAgentvHome(), "trace-state");
9049
- }
9050
- function getWorkspacePoolRoot() {
9051
- return path25.join(getAgentvHome(), "workspace-pool");
9052
- }
9053
-
9054
- // src/evaluation/providers/vscode/dispatch/constants.ts
9055
9351
  var DEFAULT_LOCK_NAME = "subagent.lock";
9056
9352
  var DEFAULT_ALIVE_FILENAME = ".alive";
9057
9353
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -10308,9 +10604,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
10308
10604
  }
10309
10605
 
10310
10606
  // src/evaluation/evaluators/scoring.ts
10311
- var PASS_THRESHOLD = 0.8;
10312
- function scoreToVerdict(score) {
10313
- return score >= PASS_THRESHOLD ? "pass" : "fail";
10607
+ var DEFAULT_THRESHOLD = 0.8;
10608
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
10609
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
10610
+ return score >= threshold ? "pass" : "fail";
10314
10611
  }
10315
10612
  function clampScore(value) {
10316
10613
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -10499,13 +10796,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
10499
10796
  async function execShellWithStdin(command, stdinPayload, options = {}) {
10500
10797
  const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
10501
10798
  const { tmpdir: tmpdir3 } = await import("node:os");
10502
- const path49 = await import("node:path");
10799
+ const path50 = await import("node:path");
10503
10800
  const { randomUUID: randomUUID10 } = await import("node:crypto");
10504
- const dir = path49.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10801
+ const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10505
10802
  await mkdir16(dir, { recursive: true });
10506
- const stdinPath = path49.join(dir, "stdin.txt");
10507
- const stdoutPath = path49.join(dir, "stdout.txt");
10508
- const stderrPath = path49.join(dir, "stderr.txt");
10803
+ const stdinPath = path50.join(dir, "stdin.txt");
10804
+ const stdoutPath = path50.join(dir, "stdout.txt");
10805
+ const stderrPath = path50.join(dir, "stderr.txt");
10509
10806
  await writeFile9(stdinPath, stdinPayload, "utf8");
10510
10807
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
10511
10808
  const { spawn: spawn5 } = await import("node:child_process");
@@ -11703,7 +12000,7 @@ ${outputSchema}`;
11703
12000
  parts.push("[[ ## scoring_criteria ## ]]");
11704
12001
  for (const rubric of rubrics) {
11705
12002
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
11706
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
12003
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
11707
12004
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
11708
12005
  if (rubric.outcome) {
11709
12006
  parts.push(`Description: ${rubric.outcome}`);
@@ -11757,54 +12054,106 @@ ${outputSchema}`;
11757
12054
  async runWithRetry(options) {
11758
12055
  const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
11759
12056
  let lastError;
12057
+ let lastInvalidResponse;
12058
+ let shouldAttemptStructureFix = false;
11760
12059
  for (let attempt = 1; attempt <= 3; attempt++) {
11761
12060
  try {
11762
- const model = graderProvider.asLanguageModel?.();
11763
- if (model) {
11764
- const modelOptions = {
11765
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11766
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11767
- };
11768
- const hasImages = images && images.length > 0;
11769
- const result = hasImages ? await generateText2({
11770
- model,
11771
- system: systemPrompt,
11772
- messages: [
11773
- {
11774
- role: "user",
11775
- content: [
11776
- { type: "text", text: userPrompt },
11777
- ...toAiSdkImageParts(images)
11778
- ]
11779
- }
11780
- ],
11781
- ...modelOptions
11782
- }) : await generateText2({
11783
- model,
11784
- system: systemPrompt,
11785
- prompt: userPrompt,
11786
- ...modelOptions
11787
- });
11788
- const data2 = schema.parse(parseJsonFromText(result.text));
11789
- const rawUsage = result.usage;
11790
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
11791
- return { data: data2, tokenUsage };
12061
+ const result = await this.generateStructuredResponse({
12062
+ context,
12063
+ graderProvider,
12064
+ systemPrompt,
12065
+ userPrompt,
12066
+ images
12067
+ });
12068
+ const canRepairResponse = result.text.trim().length > 0;
12069
+ lastInvalidResponse = canRepairResponse ? result : void 0;
12070
+ let data;
12071
+ try {
12072
+ data = schema.parse(parseJsonFromText(result.text));
12073
+ } catch (e) {
12074
+ lastError = e instanceof Error ? e : new Error(String(e));
12075
+ shouldAttemptStructureFix = canRepairResponse;
12076
+ continue;
11792
12077
  }
11793
- const response = await graderProvider.invoke({
11794
- question: userPrompt,
12078
+ return {
12079
+ data,
12080
+ providerResponse: result.providerResponse,
12081
+ tokenUsage: result.tokenUsage
12082
+ };
12083
+ } catch (e) {
12084
+ lastError = e instanceof Error ? e : new Error(String(e));
12085
+ }
12086
+ }
12087
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
12088
+ try {
12089
+ const repaired = await this.generateStructuredResponse({
12090
+ context,
12091
+ graderProvider,
11795
12092
  systemPrompt,
11796
- evalCaseId: context.evalCase.id,
11797
- attempt: context.attempt,
11798
- maxOutputTokens: this.maxOutputTokens,
11799
- temperature: this.temperature
12093
+ userPrompt: buildStructureRepairPrompt({
12094
+ validationError: lastError?.message ?? "Schema validation failed",
12095
+ invalidResponse: lastInvalidResponse.text
12096
+ })
11800
12097
  });
11801
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
11802
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
12098
+ const data = schema.parse(parseJsonFromText(repaired.text));
12099
+ return {
12100
+ data,
12101
+ providerResponse: repaired.providerResponse,
12102
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
12103
+ };
11803
12104
  } catch (e) {
11804
12105
  lastError = e instanceof Error ? e : new Error(String(e));
11805
12106
  }
11806
12107
  }
11807
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
12108
+ throw new Error(
12109
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
12110
+ );
12111
+ }
12112
+ async generateStructuredResponse(options) {
12113
+ const { context, graderProvider, systemPrompt, userPrompt, images } = options;
12114
+ const model = graderProvider.asLanguageModel?.();
12115
+ if (model) {
12116
+ const modelOptions = {
12117
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
12118
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
12119
+ };
12120
+ const hasImages = images && images.length > 0;
12121
+ const result = hasImages ? await generateText2({
12122
+ model,
12123
+ system: systemPrompt,
12124
+ messages: [
12125
+ {
12126
+ role: "user",
12127
+ content: [
12128
+ { type: "text", text: userPrompt },
12129
+ ...toAiSdkImageParts(images)
12130
+ ]
12131
+ }
12132
+ ],
12133
+ ...modelOptions
12134
+ }) : await generateText2({
12135
+ model,
12136
+ system: systemPrompt,
12137
+ prompt: userPrompt,
12138
+ ...modelOptions
12139
+ });
12140
+ const rawUsage = result.usage;
12141
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
12142
+ return { text: result.text, tokenUsage };
12143
+ }
12144
+ const response = await graderProvider.invoke({
12145
+ question: userPrompt,
12146
+ systemPrompt,
12147
+ evalCaseId: context.evalCase.id,
12148
+ attempt: context.attempt,
12149
+ maxOutputTokens: this.maxOutputTokens,
12150
+ temperature: this.temperature
12151
+ });
12152
+ return {
12153
+ text: extractLastAssistantContent(response.output),
12154
+ providerResponse: response,
12155
+ tokenUsage: response.tokenUsage
12156
+ };
11808
12157
  }
11809
12158
  };
11810
12159
  function buildOutputSchema() {
@@ -11824,6 +12173,29 @@ function buildOutputSchema() {
11824
12173
  "}"
11825
12174
  ].join("\n");
11826
12175
  }
12176
+ function buildStructureRepairPrompt(options) {
12177
+ const { validationError, invalidResponse } = options;
12178
+ return [
12179
+ "The following evaluation response has useful grading content but invalid JSON structure.",
12180
+ "Repair it to satisfy the schema in the system prompt.",
12181
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
12182
+ "",
12183
+ "Validation error:",
12184
+ validationError,
12185
+ "",
12186
+ "Invalid response:",
12187
+ invalidResponse
12188
+ ].join("\n");
12189
+ }
12190
+ function sumTokenUsage(first, second) {
12191
+ if (!first && !second) {
12192
+ return void 0;
12193
+ }
12194
+ return {
12195
+ input: (first?.input ?? 0) + (second?.input ?? 0),
12196
+ output: (first?.output ?? 0) + (second?.output ?? 0)
12197
+ };
12198
+ }
11827
12199
  function buildRubricOutputSchema() {
11828
12200
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
11829
12201
  You must return a valid JSON object matching this schema:
@@ -11923,19 +12295,21 @@ function calculateScoreRangeResult(result, rubrics) {
11923
12295
  rawScores[rubric.id] = rawScore;
11924
12296
  totalWeight += rubric.weight;
11925
12297
  weightedScoreSum += normalizedScore * rubric.weight;
11926
- let requiredMinScore;
11927
- if (rubric.required_min_score !== void 0) {
11928
- requiredMinScore = rubric.required_min_score;
12298
+ let minScoreThreshold;
12299
+ if (rubric.min_score !== void 0) {
12300
+ minScoreThreshold = rubric.min_score;
12301
+ } else if (rubric.required_min_score !== void 0) {
12302
+ minScoreThreshold = rubric.required_min_score / 10;
11929
12303
  } else if (rubric.required === true) {
11930
- requiredMinScore = 10;
12304
+ minScoreThreshold = 1;
11931
12305
  }
11932
12306
  const matchingRange = rubric.score_ranges?.find(
11933
12307
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
11934
12308
  );
11935
12309
  const rangeDescription = matchingRange?.outcome ?? "";
11936
12310
  const criterionLabel = rubric.outcome ?? rubric.id;
11937
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
11938
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
12311
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
12312
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
11939
12313
  failedRequired = true;
11940
12314
  }
11941
12315
  assertions.push({
@@ -12012,11 +12386,11 @@ function createFilesystemTools(workspacePath) {
12012
12386
  execute: async (input) => {
12013
12387
  try {
12014
12388
  const resolved = resolveSandboxed(workspacePath, input.path);
12015
- const stat10 = await fs2.stat(resolved);
12016
- if (stat10.isDirectory()) {
12389
+ const stat11 = await fs2.stat(resolved);
12390
+ if (stat11.isDirectory()) {
12017
12391
  return { error: `'${input.path}' is a directory, not a file` };
12018
12392
  }
12019
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
12393
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
12020
12394
  const fd = await fs2.open(resolved, "r");
12021
12395
  try {
12022
12396
  await fd.read(buffer, 0, buffer.length, 0);
@@ -12024,8 +12398,8 @@ function createFilesystemTools(workspacePath) {
12024
12398
  await fd.close();
12025
12399
  }
12026
12400
  const content = buffer.toString("utf-8");
12027
- const truncated = stat10.size > MAX_FILE_SIZE;
12028
- return { content, truncated, size: stat10.size };
12401
+ const truncated = stat11.size > MAX_FILE_SIZE;
12402
+ return { content, truncated, size: stat11.size };
12029
12403
  } catch (error) {
12030
12404
  return { error: error instanceof Error ? error.message : String(error) };
12031
12405
  }
@@ -12076,8 +12450,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
12076
12450
  const ext = path35.extname(entry.name).toLowerCase();
12077
12451
  if (BINARY_EXTENSIONS.has(ext)) continue;
12078
12452
  try {
12079
- const stat10 = await fs2.stat(fullPath);
12080
- if (stat10.size > MAX_FILE_SIZE) continue;
12453
+ const stat11 = await fs2.stat(fullPath);
12454
+ if (stat11.size > MAX_FILE_SIZE) continue;
12081
12455
  const content = await fs2.readFile(fullPath, "utf-8");
12082
12456
  const lines = content.split("\n");
12083
12457
  for (let i = 0; i < lines.length; i++) {
@@ -12718,115 +13092,115 @@ var FieldAccuracyEvaluator = class {
12718
13092
  * Evaluate a single field against the expected value.
12719
13093
  */
12720
13094
  evaluateField(fieldConfig, candidateData, expectedData) {
12721
- const { path: path49, match, required = true, weight = 1 } = fieldConfig;
12722
- const candidateValue = resolvePath(candidateData, path49);
12723
- const expectedValue = resolvePath(expectedData, path49);
13095
+ const { path: path50, match, required = true, weight = 1 } = fieldConfig;
13096
+ const candidateValue = resolvePath(candidateData, path50);
13097
+ const expectedValue = resolvePath(expectedData, path50);
12724
13098
  if (expectedValue === void 0) {
12725
13099
  return {
12726
- path: path49,
13100
+ path: path50,
12727
13101
  score: 1,
12728
13102
  // No expected value means no comparison needed
12729
13103
  weight,
12730
13104
  hit: true,
12731
- message: `${path49}: no expected value`
13105
+ message: `${path50}: no expected value`
12732
13106
  };
12733
13107
  }
12734
13108
  if (candidateValue === void 0) {
12735
13109
  if (required) {
12736
13110
  return {
12737
- path: path49,
13111
+ path: path50,
12738
13112
  score: 0,
12739
13113
  weight,
12740
13114
  hit: false,
12741
- message: `${path49} (required, missing)`
13115
+ message: `${path50} (required, missing)`
12742
13116
  };
12743
13117
  }
12744
13118
  return {
12745
- path: path49,
13119
+ path: path50,
12746
13120
  score: 1,
12747
13121
  // Don't penalize missing optional fields
12748
13122
  weight: 0,
12749
13123
  // Zero weight means it won't affect the score
12750
13124
  hit: true,
12751
- message: `${path49}: optional field missing`
13125
+ message: `${path50}: optional field missing`
12752
13126
  };
12753
13127
  }
12754
13128
  switch (match) {
12755
13129
  case "exact":
12756
- return this.compareExact(path49, candidateValue, expectedValue, weight);
13130
+ return this.compareExact(path50, candidateValue, expectedValue, weight);
12757
13131
  case "numeric_tolerance":
12758
13132
  return this.compareNumericTolerance(
12759
- path49,
13133
+ path50,
12760
13134
  candidateValue,
12761
13135
  expectedValue,
12762
13136
  fieldConfig,
12763
13137
  weight
12764
13138
  );
12765
13139
  case "date":
12766
- return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
13140
+ return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
12767
13141
  default:
12768
13142
  return {
12769
- path: path49,
13143
+ path: path50,
12770
13144
  score: 0,
12771
13145
  weight,
12772
13146
  hit: false,
12773
- message: `${path49}: unknown match type "${match}"`
13147
+ message: `${path50}: unknown match type "${match}"`
12774
13148
  };
12775
13149
  }
12776
13150
  }
12777
13151
  /**
12778
13152
  * Exact equality comparison.
12779
13153
  */
12780
- compareExact(path49, candidateValue, expectedValue, weight) {
13154
+ compareExact(path50, candidateValue, expectedValue, weight) {
12781
13155
  if (deepEqual(candidateValue, expectedValue)) {
12782
13156
  return {
12783
- path: path49,
13157
+ path: path50,
12784
13158
  score: 1,
12785
13159
  weight,
12786
13160
  hit: true,
12787
- message: path49
13161
+ message: path50
12788
13162
  };
12789
13163
  }
12790
13164
  if (typeof candidateValue !== typeof expectedValue) {
12791
13165
  return {
12792
- path: path49,
13166
+ path: path50,
12793
13167
  score: 0,
12794
13168
  weight,
12795
13169
  hit: false,
12796
- message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13170
+ message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12797
13171
  };
12798
13172
  }
12799
13173
  return {
12800
- path: path49,
13174
+ path: path50,
12801
13175
  score: 0,
12802
13176
  weight,
12803
13177
  hit: false,
12804
- message: `${path49} (value mismatch)`
13178
+ message: `${path50} (value mismatch)`
12805
13179
  };
12806
13180
  }
12807
13181
  /**
12808
13182
  * Numeric comparison with absolute or relative tolerance.
12809
13183
  */
12810
- compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
13184
+ compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
12811
13185
  const { tolerance = 0, relative = false } = fieldConfig;
12812
13186
  const candidateNum = toNumber(candidateValue);
12813
13187
  const expectedNum = toNumber(expectedValue);
12814
13188
  if (candidateNum === null || expectedNum === null) {
12815
13189
  return {
12816
- path: path49,
13190
+ path: path50,
12817
13191
  score: 0,
12818
13192
  weight,
12819
13193
  hit: false,
12820
- message: `${path49} (non-numeric value)`
13194
+ message: `${path50} (non-numeric value)`
12821
13195
  };
12822
13196
  }
12823
13197
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12824
13198
  return {
12825
- path: path49,
13199
+ path: path50,
12826
13200
  score: 0,
12827
13201
  weight,
12828
13202
  hit: false,
12829
- message: `${path49} (invalid numeric value)`
13203
+ message: `${path50} (invalid numeric value)`
12830
13204
  };
12831
13205
  }
12832
13206
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12839,61 +13213,61 @@ var FieldAccuracyEvaluator = class {
12839
13213
  }
12840
13214
  if (withinTolerance) {
12841
13215
  return {
12842
- path: path49,
13216
+ path: path50,
12843
13217
  score: 1,
12844
13218
  weight,
12845
13219
  hit: true,
12846
- message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
13220
+ message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
12847
13221
  };
12848
13222
  }
12849
13223
  return {
12850
- path: path49,
13224
+ path: path50,
12851
13225
  score: 0,
12852
13226
  weight,
12853
13227
  hit: false,
12854
- message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13228
+ message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12855
13229
  };
12856
13230
  }
12857
13231
  /**
12858
13232
  * Date comparison with format normalization.
12859
13233
  */
12860
- compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
13234
+ compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
12861
13235
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12862
13236
  const candidateDate = parseDate(String(candidateValue), formats);
12863
13237
  const expectedDate = parseDate(String(expectedValue), formats);
12864
13238
  if (candidateDate === null) {
12865
13239
  return {
12866
- path: path49,
13240
+ path: path50,
12867
13241
  score: 0,
12868
13242
  weight,
12869
13243
  hit: false,
12870
- message: `${path49} (unparseable candidate date)`
13244
+ message: `${path50} (unparseable candidate date)`
12871
13245
  };
12872
13246
  }
12873
13247
  if (expectedDate === null) {
12874
13248
  return {
12875
- path: path49,
13249
+ path: path50,
12876
13250
  score: 0,
12877
13251
  weight,
12878
13252
  hit: false,
12879
- message: `${path49} (unparseable expected date)`
13253
+ message: `${path50} (unparseable expected date)`
12880
13254
  };
12881
13255
  }
12882
13256
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12883
13257
  return {
12884
- path: path49,
13258
+ path: path50,
12885
13259
  score: 1,
12886
13260
  weight,
12887
13261
  hit: true,
12888
- message: path49
13262
+ message: path50
12889
13263
  };
12890
13264
  }
12891
13265
  return {
12892
- path: path49,
13266
+ path: path50,
12893
13267
  score: 0,
12894
13268
  weight,
12895
13269
  hit: false,
12896
- message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13270
+ message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12897
13271
  };
12898
13272
  }
12899
13273
  /**
@@ -12926,11 +13300,11 @@ var FieldAccuracyEvaluator = class {
12926
13300
  };
12927
13301
  }
12928
13302
  };
12929
- function resolvePath(obj, path49) {
12930
- if (!path49 || !obj) {
13303
+ function resolvePath(obj, path50) {
13304
+ if (!path50 || !obj) {
12931
13305
  return void 0;
12932
13306
  }
12933
- const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
13307
+ const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
12934
13308
  let current = obj;
12935
13309
  for (const part of parts) {
12936
13310
  if (current === null || current === void 0) {
@@ -13422,8 +13796,8 @@ var TokenUsageEvaluator = class {
13422
13796
  };
13423
13797
 
13424
13798
  // src/evaluation/evaluators/tool-trajectory.ts
13425
- function getNestedValue(obj, path49) {
13426
- const parts = path49.split(".");
13799
+ function getNestedValue(obj, path50) {
13800
+ const parts = path50.split(".");
13427
13801
  let current = obj;
13428
13802
  for (const part of parts) {
13429
13803
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -15195,7 +15569,7 @@ var WorkspacePoolManager = class {
15195
15569
  }
15196
15570
  /**
15197
15571
  * Reset an existing slot for reuse:
15198
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
15572
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
15199
15573
  * 2. Re-copy template files (skip repo directories)
15200
15574
  */
15201
15575
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -15208,7 +15582,17 @@ var WorkspacePoolManager = class {
15208
15582
  continue;
15209
15583
  }
15210
15584
  const ref = repo.checkout?.ref ?? "HEAD";
15211
- await git(["reset", "--hard", ref], { cwd: repoDir });
15585
+ const resolve = repo.checkout?.resolve ?? "remote";
15586
+ if (resolve === "remote") {
15587
+ const fetchArgs = ["fetch", "origin", ref];
15588
+ if (repo.clone?.depth) {
15589
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
15590
+ }
15591
+ await git(fetchArgs, { cwd: repoDir });
15592
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
15593
+ } else {
15594
+ await git(["reset", "--hard", ref], { cwd: repoDir });
15595
+ }
15212
15596
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
15213
15597
  await git(["clean", cleanFlag], { cwd: repoDir });
15214
15598
  }
@@ -15505,7 +15889,7 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
15505
15889
  }
15506
15890
 
15507
15891
  // src/evaluation/orchestrator.ts
15508
- function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
15892
+ function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
15509
15893
  return score >= threshold ? "ok" : "quality_failure";
15510
15894
  }
15511
15895
  function buildSkippedEvaluatorError(scores) {
@@ -15597,7 +15981,7 @@ async function runEvaluation(options) {
15597
15981
  const filteredEvalCases = filterEvalCases(evalCases, filter);
15598
15982
  if (filteredEvalCases.length === 0) {
15599
15983
  if (filter) {
15600
- throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
15984
+ throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
15601
15985
  }
15602
15986
  return [];
15603
15987
  }
@@ -15649,6 +16033,9 @@ async function runEvaluation(options) {
15649
16033
  const graderName = targetContext.graderTarget ?? targetContext.name;
15650
16034
  const resolvedGrader = resolveTargetByName(graderName);
15651
16035
  if (!resolvedGrader) {
16036
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
16037
+ return void 0;
16038
+ }
15652
16039
  return getOrCreateProvider(targetContext);
15653
16040
  }
15654
16041
  return getOrCreateProvider(resolvedGrader);
@@ -15979,7 +16366,7 @@ async function runEvaluation(options) {
15979
16366
  const budgetResult = {
15980
16367
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15981
16368
  testId: evalCase.id,
15982
- dataset: evalCase.dataset,
16369
+ suite: evalCase.suite,
15983
16370
  category: evalCase.category,
15984
16371
  score: 0,
15985
16372
  assertions: [],
@@ -16016,7 +16403,7 @@ async function runEvaluation(options) {
16016
16403
  const haltResult = {
16017
16404
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
16018
16405
  testId: evalCase.id,
16019
- dataset: evalCase.dataset,
16406
+ suite: evalCase.suite,
16020
16407
  category: evalCase.category,
16021
16408
  score: 0,
16022
16409
  assertions: [],
@@ -16328,7 +16715,7 @@ async function runBatchEvaluation(options) {
16328
16715
  targetResolver,
16329
16716
  availableTargets,
16330
16717
  verbose,
16331
- threshold: batchThreshold
16718
+ threshold: evalCase.threshold ?? batchThreshold
16332
16719
  });
16333
16720
  if (providerError) {
16334
16721
  result = {
@@ -16790,8 +17177,9 @@ async function runEvalCase(options) {
16790
17177
  fileChanges,
16791
17178
  workspacePath,
16792
17179
  verbose,
16793
- threshold: caseThreshold
17180
+ threshold: evalCase.threshold ?? caseThreshold
16794
17181
  });
17182
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
16795
17183
  const totalDurationMs = Date.now() - caseStartMs;
16796
17184
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
16797
17185
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -16805,7 +17193,7 @@ async function runEvalCase(options) {
16805
17193
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
16806
17194
  };
16807
17195
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
16808
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
17196
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
16809
17197
  const targetUsedField = targetUsed ? { targetUsed } : {};
16810
17198
  const finalResult = providerError ? {
16811
17199
  ...result,
@@ -17006,7 +17394,8 @@ async function evaluateCandidate(options) {
17006
17394
  targetResolver,
17007
17395
  availableTargets,
17008
17396
  fileChanges,
17009
- workspacePath
17397
+ workspacePath,
17398
+ threshold: evalThreshold
17010
17399
  });
17011
17400
  const completedAt = nowFn();
17012
17401
  let agentRequest;
@@ -17037,7 +17426,7 @@ async function evaluateCandidate(options) {
17037
17426
  return {
17038
17427
  timestamp: completedAt.toISOString(),
17039
17428
  testId: evalCase.id,
17040
- dataset: evalCase.dataset,
17429
+ suite: evalCase.suite,
17041
17430
  category: evalCase.category,
17042
17431
  conversationId: evalCase.conversation_id,
17043
17432
  score: score.score,
@@ -17080,7 +17469,8 @@ async function runEvaluatorsForCase(options) {
17080
17469
  targetResolver,
17081
17470
  availableTargets,
17082
17471
  fileChanges,
17083
- workspacePath
17472
+ workspacePath,
17473
+ threshold
17084
17474
  } = options;
17085
17475
  if (evalCase.assertions && evalCase.assertions.length > 0) {
17086
17476
  return runEvaluatorList({
@@ -17106,7 +17496,8 @@ async function runEvaluatorsForCase(options) {
17106
17496
  targetResolver,
17107
17497
  availableTargets,
17108
17498
  fileChanges,
17109
- workspacePath
17499
+ workspacePath,
17500
+ threshold
17110
17501
  });
17111
17502
  }
17112
17503
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -17208,7 +17599,8 @@ async function runEvaluatorList(options) {
17208
17599
  name: evaluatorConfig.name,
17209
17600
  type: evaluatorConfig.type,
17210
17601
  weight,
17211
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17602
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
17603
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
17212
17604
  });
17213
17605
  scores.push({
17214
17606
  name: evaluatorConfig.name,
@@ -17243,7 +17635,8 @@ async function runEvaluatorList(options) {
17243
17635
  name: evaluatorConfig.name ?? "unknown",
17244
17636
  type: evaluatorConfig.type ?? "llm-grader",
17245
17637
  weight,
17246
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17638
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
17639
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
17247
17640
  });
17248
17641
  scores.push({
17249
17642
  name: evaluatorConfig.name ?? "unknown",
@@ -17277,9 +17670,10 @@ async function runEvaluatorList(options) {
17277
17670
  }
17278
17671
  }
17279
17672
  }
17673
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
17280
17674
  const hasRequiredFailure = scored.some((entry) => {
17281
17675
  if (!entry.required) return false;
17282
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
17676
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
17283
17677
  return entry.score.score < minScore;
17284
17678
  });
17285
17679
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -17290,17 +17684,23 @@ async function runEvaluatorList(options) {
17290
17684
  const expectedAspectCount = assertions.length || 1;
17291
17685
  const score = {
17292
17686
  score: aggregateScore,
17293
- verdict: scoreToVerdict(aggregateScore),
17687
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
17294
17688
  assertions,
17295
17689
  expectedAspectCount
17296
17690
  };
17297
17691
  return { score, scores };
17298
17692
  }
17693
+ function formatFilter(filter) {
17694
+ return typeof filter === "string" ? filter : filter.join(", ");
17695
+ }
17696
+ function matchesFilter3(id, filter) {
17697
+ return typeof filter === "string" ? micromatch3.isMatch(id, filter) : filter.some((pattern) => micromatch3.isMatch(id, pattern));
17698
+ }
17299
17699
  function filterEvalCases(evalCases, filter) {
17300
17700
  if (!filter) {
17301
17701
  return evalCases;
17302
17702
  }
17303
- return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
17703
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
17304
17704
  }
17305
17705
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
17306
17706
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -17387,7 +17787,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
17387
17787
  return {
17388
17788
  timestamp: timestamp.toISOString(),
17389
17789
  testId: evalCase.id,
17390
- dataset: evalCase.dataset,
17790
+ suite: evalCase.suite,
17391
17791
  category: evalCase.category,
17392
17792
  conversationId: evalCase.conversation_id,
17393
17793
  score: 0,
@@ -17659,6 +18059,7 @@ async function evaluate(config) {
17659
18059
  verbose: config.verbose,
17660
18060
  maxConcurrency: config.workers ?? 3,
17661
18061
  filter: config.filter,
18062
+ threshold: config.threshold,
17662
18063
  evalCases,
17663
18064
  onResult: async (result) => {
17664
18065
  collectedResults.push(result);
@@ -17669,19 +18070,19 @@ async function evaluate(config) {
17669
18070
  const durationMs = Date.now() - startTime;
17670
18071
  return {
17671
18072
  results: allResults,
17672
- summary: computeSummary(allResults, durationMs)
18073
+ summary: computeSummary(allResults, durationMs, config.threshold)
17673
18074
  };
17674
18075
  }
17675
18076
  function mapAssertionType(type) {
17676
18077
  return type.replace(/_/g, "-");
17677
18078
  }
17678
- function computeSummary(results, durationMs) {
18079
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
17679
18080
  const total = results.length;
17680
18081
  let passed = 0;
17681
18082
  let scoreSum = 0;
17682
18083
  for (const r of results) {
17683
18084
  scoreSum += r.score;
17684
- if (r.score >= PASS_THRESHOLD) {
18085
+ if (r.score >= threshold) {
17685
18086
  passed++;
17686
18087
  }
17687
18088
  }
@@ -17794,7 +18195,7 @@ var CONFIG_FILE_NAMES = [
17794
18195
  ];
17795
18196
  async function loadTsConfig(projectRoot) {
17796
18197
  const { existsSync: existsSync7 } = await import("node:fs");
17797
- const { pathToFileURL } = await import("node:url");
18198
+ const { pathToFileURL: pathToFileURL2 } = await import("node:url");
17798
18199
  const { join: join2 } = await import("node:path");
17799
18200
  for (const fileName of CONFIG_FILE_NAMES) {
17800
18201
  const filePath = join2(projectRoot, fileName);
@@ -17802,7 +18203,7 @@ async function loadTsConfig(projectRoot) {
17802
18203
  continue;
17803
18204
  }
17804
18205
  try {
17805
- const fileUrl = pathToFileURL(filePath).href;
18206
+ const fileUrl = pathToFileURL2(filePath).href;
17806
18207
  const mod = await import(fileUrl);
17807
18208
  const config = mod.default ?? mod;
17808
18209
  return AgentVConfigSchema.parse(config);
@@ -17936,7 +18337,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
17936
18337
  }
17937
18338
 
17938
18339
  // src/projects.ts
17939
- import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
18340
+ import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
17940
18341
  import path47 from "node:path";
17941
18342
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
17942
18343
  function getProjectsRegistryPath() {
@@ -17962,7 +18363,7 @@ function saveProjectRegistry(registry) {
17962
18363
  const registryPath = getProjectsRegistryPath();
17963
18364
  const dir = path47.dirname(registryPath);
17964
18365
  if (!existsSync6(dir)) {
17965
- mkdirSync(dir, { recursive: true });
18366
+ mkdirSync2(dir, { recursive: true });
17966
18367
  }
17967
18368
  writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
17968
18369
  }
@@ -18228,7 +18629,7 @@ var OtelTraceExporter = class {
18228
18629
  rootSpan.setAttribute("gen_ai.system", "agentv");
18229
18630
  rootSpan.setAttribute("agentv.test_id", result.testId);
18230
18631
  rootSpan.setAttribute("agentv.target", result.target);
18231
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
18632
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
18232
18633
  rootSpan.setAttribute("agentv.score", result.score);
18233
18634
  if (captureContent && result.output.length > 0) {
18234
18635
  const lastMsg = result.output[result.output.length - 1];
@@ -18437,7 +18838,7 @@ var OtelStreamingObserver = class {
18437
18838
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
18438
18839
  this.rootSpan.setAttribute("agentv.test_id", testId);
18439
18840
  this.rootSpan.setAttribute("agentv.target", target);
18440
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
18841
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
18441
18842
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
18442
18843
  }
18443
18844
  /** Create and immediately export a tool span */
@@ -18786,11 +19187,241 @@ function extractToolResultContent(content) {
18786
19187
  return parts.length > 0 ? parts.join("") : void 0;
18787
19188
  }
18788
19189
 
18789
- // src/import/session-discovery.ts
19190
+ // src/import/codex-parser.ts
19191
/**
 * Decode the raw input of a Codex tool call.
 *
 * String values are parsed as JSON when possible; a string that is not valid
 * JSON, and every non-string value, is returned unchanged.
 */
function decodeCodexToolInput(raw) {
  if (typeof raw !== "string") return raw;
  try {
    return JSON.parse(raw);
  } catch {
    // Free-form text (not JSON): keep it verbatim.
    return raw;
  }
}

/**
 * Parse the JSONL text of a Codex session rollout into a transcript.
 *
 * Lines that are blank, are not valid JSON, do not parse to an object, or
 * have no `type` are skipped. Handled entry types:
 * - `session_meta`: session id, cwd, CLI version and (if not yet known) model.
 * - `turn_context`: model and cwd, only when not already known.
 * - `response_item`: `message` items with role `user`/`assistant` become
 *   messages; `function_call` / `custom_tool_call` items become assistant
 *   messages with one tool call; `*_output` items are attached to the tool
 *   call with the same `call_id`. All other item types (e.g. `reasoning`)
 *   are ignored.
 *
 * @param jsonl Full text of the rollout file, one JSON object per line.
 * @returns `{ messages, source, tokenUsage, durationMs, costUsd }`.
 *   `durationMs` is the difference between the last and the first entry
 *   timestamp, or `undefined` when no usable timestamps were found.
 */
function parseCodexSession(jsonl) {
  const messages = [];
  // call_id -> position of the tool call still waiting for its output.
  const pendingCalls = new Map();
  let sessionId = "";
  let cwd;
  let model;
  let version;
  let startTimestamp;
  let endTimestamp;

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue;
    }
    // `null` and primitives are valid JSON but are not rollout entries.
    if (entry === null || typeof entry !== "object" || !entry.type) continue;
    if (entry.timestamp) {
      if (!startTimestamp) startTimestamp = entry.timestamp;
      endTimestamp = entry.timestamp;
    }
    const payload = entry.payload ?? {};
    switch (entry.type) {
      case "session_meta": {
        sessionId = String(payload.id ?? "");
        cwd = payload.cwd ? String(payload.cwd) : void 0;
        version = payload.cli_version ? String(payload.cli_version) : void 0;
        if (payload.model && !model) {
          model = String(payload.model);
        }
        break;
      }
      case "turn_context": {
        if (payload.model && !model) {
          model = String(payload.model);
        }
        if (payload.cwd && !cwd) {
          cwd = String(payload.cwd);
        }
        break;
      }
      case "response_item": {
        switch (String(payload.type ?? "")) {
          case "message": {
            const role = String(payload.role ?? "");
            // Only user/assistant turns are kept (developer etc. are dropped).
            if (role !== "user" && role !== "assistant") break;
            const content = extractResponseItemContent(payload.content);
            if (content) {
              messages.push({ role, content });
            }
            break;
          }
          case "function_call":
          case "custom_tool_call": {
            const callId = String(payload.call_id ?? "");
            // Prefer `arguments`; fall back to `input`, the field custom tool
            // call items use for their payload.
            const rawInput = payload.arguments !== void 0 ? payload.arguments : payload.input;
            const toolCall = {
              tool: String(payload.name ?? ""),
              input: decodeCodexToolInput(rawInput),
              id: callId
            };
            const msgIdx = messages.length;
            messages.push({
              role: "assistant",
              toolCalls: [toolCall]
            });
            if (callId) {
              pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
            }
            break;
          }
          case "function_call_output":
          case "custom_tool_call_output": {
            const callId = String(payload.call_id ?? "");
            const pending = pendingCalls.get(callId);
            if (pending) {
              // Rebuild the message instead of mutating the stored objects.
              const existingMsg = messages[pending.msgIdx];
              const existingCalls = [...existingMsg.toolCalls ?? []];
              existingCalls[pending.toolIdx] = {
                ...existingCalls[pending.toolIdx],
                output: payload.output
              };
              messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
              pendingCalls.delete(callId);
            }
            break;
          }
          // Skip reasoning blocks (thinking tokens) and unknown item types.
          default:
            break;
        }
        break;
      }
    }
  }

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps yield NaN; report "unknown" instead.
    if (Number.isFinite(elapsed)) {
      durationMs = elapsed;
    }
  }
  const source = {
    provider: "codex",
    sessionId,
    cwd,
    startedAt: startTimestamp,
    model,
    version
  };
  return {
    messages,
    source,
    // Codex rollout files don't include token counts (only rate limit info)
    tokenUsage: void 0,
    durationMs,
    costUsd: null
  };
}
19340
/**
 * Flatten the `content` of a response item into plain text.
 *
 * A string is returned as-is. For an array, the `text` of every object
 * element whose `text` is a string is concatenated (no separator). Returns
 * `undefined` for any other input, or when the array yields no text parts.
 */
function extractResponseItemContent(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const texts = content
    .filter((block) => block !== null && typeof block === "object" && typeof block.text === "string")
    .map((block) => block.text);
  return texts.length === 0 ? void 0 : texts.join("");
}
19354
+
19355
+ // src/import/codex-session-discovery.ts
18790
19356
  import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
18791
19357
  import { homedir as homedir3 } from "node:os";
18792
19358
  import path48 from "node:path";
18793
- var DEFAULT_PROJECTS_DIR = () => path48.join(homedir3(), ".claude", "projects");
19359
+ var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");
19360
/**
 * Find Codex rollout files under a `<year>/<month>/<day>/` directory tree.
 *
 * Only files named `rollout-*.jsonl` are considered. The session id is the
 * last five dash-separated parts of the file name (without extension) when
 * there are at least six parts, otherwise the whole name without extension.
 *
 * @param opts Optional settings:
 *   - `sessionsDir`: root to scan (defaults to `DEFAULT_SESSIONS_DIR()`).
 *   - `date`: keep only day directories whose `year-month-day` equals it.
 *   - `latest`: when truthy, return at most one session.
 *   - `limit`: maximum number of sessions (default 10; ignored if `latest`).
 * @returns Sessions `{ sessionId, filePath, filename, updatedAt }`, sorted by
 *   `updatedAt` (file mtime) descending and truncated to the limit.
 */
async function discoverCodexSessions(opts) {
  const root = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
  const maxResults = opts?.latest ? 1 : opts?.limit ?? 10;

  // A directory that cannot be listed is treated as having no entries.
  const listDir = async (dir) => {
    try {
      return await readdir8(dir);
    } catch {
      return [];
    }
  };

  const found = [];
  for (const year of await listDir(root)) {
    const yearPath = path48.join(root, year);
    for (const month of await listDir(yearPath)) {
      const monthPath = path48.join(yearPath, month);
      for (const day of await listDir(monthPath)) {
        if (opts?.date && `${year}-${month}-${day}` !== opts.date) continue;
        const dayPath = path48.join(monthPath, day);
        for (const file of await listDir(dayPath)) {
          const isRollout = file.startsWith("rollout-") && file.endsWith(".jsonl");
          if (!isRollout) continue;
          const filePath = path48.join(dayPath, file);
          const stem = file.replace(/\.jsonl$/, "");
          const segments = stem.split("-");
          const sessionId = segments.length >= 6 ? segments.slice(-5).join("-") : stem;
          let updatedAt;
          try {
            updatedAt = (await stat9(filePath)).mtime;
          } catch {
            // File could not be stat'ed: sort it last via the epoch.
            updatedAt = new Date(0);
          }
          found.push({ sessionId, filePath, filename: file, updatedAt });
        }
      }
    }
  }
  found.sort((left, right) => right.updatedAt.getTime() - left.updatedAt.getTime());
  return found.slice(0, maxResults);
}
19419
+
19420
+ // src/import/session-discovery.ts
19421
+ import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
19422
+ import { homedir as homedir4 } from "node:os";
19423
+ import path49 from "node:path";
19424
+ var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
18794
19425
  function encodeProjectPath(projectPath) {
18795
19426
  return projectPath.replace(/\//g, "-");
18796
19427
  }
@@ -18799,7 +19430,7 @@ async function discoverClaudeSessions(opts) {
18799
19430
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
18800
19431
  let projectDirs;
18801
19432
  try {
18802
- projectDirs = await readdir8(projectsDir);
19433
+ projectDirs = await readdir9(projectsDir);
18803
19434
  } catch {
18804
19435
  return [];
18805
19436
  }
@@ -18809,10 +19440,10 @@ async function discoverClaudeSessions(opts) {
18809
19440
  }
18810
19441
  const sessions = [];
18811
19442
  for (const projectDir of projectDirs) {
18812
- const dirPath = path48.join(projectsDir, projectDir);
19443
+ const dirPath = path49.join(projectsDir, projectDir);
18813
19444
  let entries;
18814
19445
  try {
18815
- entries = await readdir8(dirPath);
19446
+ entries = await readdir9(dirPath);
18816
19447
  } catch {
18817
19448
  continue;
18818
19449
  }
@@ -18820,10 +19451,10 @@ async function discoverClaudeSessions(opts) {
18820
19451
  if (!entry.endsWith(".jsonl")) continue;
18821
19452
  const sessionId = entry.replace(/\.jsonl$/, "");
18822
19453
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
18823
- const filePath = path48.join(dirPath, entry);
19454
+ const filePath = path49.join(dirPath, entry);
18824
19455
  let updatedAt;
18825
19456
  try {
18826
- const fileStat = await stat9(filePath);
19457
+ const fileStat = await stat10(filePath);
18827
19458
  updatedAt = fileStat.mtime;
18828
19459
  } catch {
18829
19460
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -18842,10 +19473,85 @@ async function discoverClaudeSessions(opts) {
18842
19473
 
18843
19474
  // src/import/types.ts
18844
19475
  import { readFile as readFile14 } from "node:fs/promises";
19476
/**
 * Convert a parsed transcript entry (camelCase) into the snake_case record
 * stored as one line of a transcript JSONL file.
 *
 * `input` is the content of the first `user` message when that content is a
 * string, otherwise the empty string. `output` is the full message list.
 * `source.cwd` falls back to `source.projectPath` when `cwd` is nullish.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage, source } = entry;

  const firstUser = messages.find((message) => message.role === "user");
  let input = "";
  if (typeof firstUser?.content === "string") {
    input = firstUser.content;
  }

  let tokenUsageRecord;
  if (tokenUsage) {
    tokenUsageRecord = {
      input: tokenUsage.input,
      output: tokenUsage.output,
      cached: tokenUsage.cached
    };
  }

  const sourceRecord = {
    provider: source.provider,
    session_id: source.sessionId,
    model: source.model,
    timestamp: source.startedAt,
    git_branch: source.gitBranch,
    cwd: source.cwd ?? source.projectPath,
    version: source.version
  };

  return {
    input,
    output: messages,
    token_usage: tokenUsageRecord,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: sourceRecord
  };
}
19500
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * @param filePath Path of the JSONL file (read as UTF-8).
 * @returns The parsed values, one per non-blank line, in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and the 1-based line number; the original parse error is `cause`.
 *   Errors from reading the file are propagated unchanged.
 */
async function readTranscriptJsonl(filePath) {
  const text = await readFile14(filePath, "utf8");
  const records = [];
  for (const [index, line] of text.split("\n").entries()) {
    if (line.trim().length === 0) continue;
    try {
      records.push(JSON.parse(line));
    } catch (err) {
      // Keep the SyntaxError type, but say where the bad line is.
      throw new SyntaxError(
        `Invalid JSON on line ${index + 1} of ${filePath}: ${err.message}`,
        { cause: err }
      );
    }
  }
  return records;
}
18845
19504
  async function readTranscriptFile(filePath) {
18846
19505
  return readFile14(filePath, "utf8");
18847
19506
  }
18848
19507
 
19508
+ // src/import/transcript-provider.ts
19509
/**
 * Provider that replays pre-recorded transcript lines instead of calling a
 * live target: each `invoke()` returns the next line, in order.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  // Index of the next transcript line to hand out.
  cursor = 0;
  /**
   * @param targetName Name used for `targetName` and for the id
   *   (`transcript:<targetName>`).
   * @param lines Parsed transcript records, one per expected invocation.
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The target name is the first line's `source.provider`, or "transcript"
   * when the line carries no such metadata.
   *
   * @throws {Error} When the file contains no transcript lines.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // `source` is optional metadata: fall back rather than crash without it.
    const providerName = lines[0]?.source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Number of transcript lines held by this provider. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next transcript line as a provider response. The request
   * itself is ignored; lines are consumed strictly in order.
   *
   * @throws {Error} When every line has already been consumed.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // Normalize a null cost to undefined.
      costUsd: line.cost_usd ?? void 0,
      // Lines without `source` metadata simply have no start time.
      startTime: line.source?.timestamp
    };
  }
};
19554
+
18849
19555
  // src/index.ts
18850
19556
  function createAgentKernel() {
18851
19557
  return { status: "stub" };
@@ -18859,6 +19565,7 @@ export {
18859
19565
  DEFAULT_EVALUATOR_TEMPLATE,
18860
19566
  DEFAULT_EVAL_PATTERNS,
18861
19567
  DEFAULT_EXPLORATION_TOOLS,
19568
+ DEFAULT_THRESHOLD,
18862
19569
  DeterministicAssertionEvaluator,
18863
19570
  EvaluatorRegistry,
18864
19571
  ExecutionMetricsEvaluator,
@@ -18880,6 +19587,7 @@ export {
18880
19587
  TemplateNotFoundError,
18881
19588
  TokenUsageEvaluator,
18882
19589
  ToolTrajectoryEvaluator,
19590
+ TranscriptProvider,
18883
19591
  WorkspaceCreationError,
18884
19592
  WorkspacePoolManager,
18885
19593
  addProject,
@@ -18916,6 +19624,7 @@ export {
18916
19624
  detectFormat,
18917
19625
  discoverAssertions,
18918
19626
  discoverClaudeSessions,
19627
+ discoverCodexSessions,
18919
19628
  discoverCopilotSessions,
18920
19629
  discoverGraders,
18921
19630
  discoverGraders as discoverJudges,
@@ -18976,6 +19685,8 @@ export {
18976
19685
  normalizeLineEndings,
18977
19686
  parseAgentSkillsEvals,
18978
19687
  parseClaudeSession,
19688
+ parseCodexSession,
19689
+ parseCopilotEvents,
18979
19690
  parseJsonFromText,
18980
19691
  parseJsonSafe,
18981
19692
  readJsonFile,
@@ -18983,6 +19694,7 @@ export {
18983
19694
  readTestSuiteMetadata,
18984
19695
  readTextFile,
18985
19696
  readTranscriptFile,
19697
+ readTranscriptJsonl,
18986
19698
  removeProject,
18987
19699
  resolveAndCreateProvider,
18988
19700
  resolveDelegatedTargetDefinition,
@@ -19015,6 +19727,7 @@ export {
19015
19727
  substituteVariables,
19016
19728
  toCamelCaseDeep,
19017
19729
  toSnakeCaseDeep,
19730
+ toTranscriptJsonLine,
19018
19731
  tokensPerTool,
19019
19732
  touchProject,
19020
19733
  transpileEvalYaml,