@agentv/core 4.6.0 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  import {
2
2
  COMMON_TARGET_SETTINGS,
3
+ LLM_GRADER_CAPABLE_KINDS,
3
4
  TEST_MESSAGE_ROLES,
4
5
  buildDirectoryChain,
5
6
  buildSearchRoots,
@@ -21,9 +22,10 @@ import {
21
22
  normalizeLineEndings,
22
23
  readJsonFile,
23
24
  readTextFile,
25
+ resolveDelegatedTargetDefinition,
24
26
  resolveFileReference,
25
27
  resolveTargetDefinition
26
- } from "./chunk-AIQ5FO4G.js";
28
+ } from "./chunk-75RFVESM.js";
27
29
  import {
28
30
  AgentvProvider
29
31
  } from "./chunk-PRNXHNLF.js";
@@ -855,8 +857,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
855
857
  const negate = rawEvaluator.negate === true ? true : void 0;
856
858
  if (isCustomType) {
857
859
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
858
- const required2 = parseRequired(rawEvaluator.required);
859
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
860
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
861
+ rawEvaluator.required,
862
+ rawEvaluator.min_score,
863
+ name,
864
+ evalId
865
+ );
866
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
860
867
  const config2 = {};
861
868
  for (const [key, value] of Object.entries(rawEvaluator)) {
862
869
  if (!knownProps2.has(key) && value !== void 0) {
@@ -868,6 +875,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
868
875
  type: customTypeName,
869
876
  ...weight2 !== void 0 ? { weight: weight2 } : {},
870
877
  ...required2 !== void 0 ? { required: required2 } : {},
878
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
871
879
  ...negate !== void 0 ? { negate } : {},
872
880
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
873
881
  });
@@ -937,7 +945,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
937
945
  );
938
946
  }
939
947
  }
940
- const required2 = parseRequired(rawEvaluator.required);
948
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
949
+ rawEvaluator.required,
950
+ rawEvaluator.min_score,
951
+ name,
952
+ evalId
953
+ );
941
954
  const knownProps2 = /* @__PURE__ */ new Set([
942
955
  "name",
943
956
  "type",
@@ -963,6 +976,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
963
976
  resolvedCwd,
964
977
  ...weight2 !== void 0 ? { weight: weight2 } : {},
965
978
  ...required2 !== void 0 ? { required: required2 } : {},
979
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
966
980
  ...negate !== void 0 ? { negate } : {},
967
981
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
968
982
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -1091,7 +1105,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1091
1105
  };
1092
1106
  }
1093
1107
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1094
- const required2 = parseRequired(rawEvaluator.required);
1108
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1109
+ rawEvaluator.required,
1110
+ rawEvaluator.min_score,
1111
+ name,
1112
+ evalId
1113
+ );
1095
1114
  evaluators.push({
1096
1115
  name,
1097
1116
  type: "composite",
@@ -1099,6 +1118,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1099
1118
  aggregator,
1100
1119
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1101
1120
  ...required2 !== void 0 ? { required: required2 } : {},
1121
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1102
1122
  ...negate !== void 0 ? { negate } : {}
1103
1123
  });
1104
1124
  continue;
@@ -1209,7 +1229,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1209
1229
  continue;
1210
1230
  }
1211
1231
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1212
- const required2 = parseRequired(rawEvaluator.required);
1232
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1233
+ rawEvaluator.required,
1234
+ rawEvaluator.min_score,
1235
+ name,
1236
+ evalId
1237
+ );
1213
1238
  const config2 = {
1214
1239
  name,
1215
1240
  type: "tool-trajectory",
@@ -1218,6 +1243,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1218
1243
  ...expected ? { expected } : {},
1219
1244
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1220
1245
  ...required2 !== void 0 ? { required: required2 } : {},
1246
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1221
1247
  ...negate !== void 0 ? { negate } : {},
1222
1248
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
1223
1249
  };
@@ -1280,7 +1306,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1280
1306
  const aggregation = asString(rawEvaluator.aggregation);
1281
1307
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
1282
1308
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1283
- const required2 = parseRequired(rawEvaluator.required);
1309
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1310
+ rawEvaluator.required,
1311
+ rawEvaluator.min_score,
1312
+ name,
1313
+ evalId
1314
+ );
1284
1315
  evaluators.push({
1285
1316
  name,
1286
1317
  type: "field-accuracy",
@@ -1288,6 +1319,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1288
1319
  ...validAggregation ? { aggregation: validAggregation } : {},
1289
1320
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1290
1321
  ...required2 !== void 0 ? { required: required2 } : {},
1322
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1291
1323
  ...negate !== void 0 ? { negate } : {}
1292
1324
  });
1293
1325
  continue;
@@ -1301,13 +1333,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1301
1333
  continue;
1302
1334
  }
1303
1335
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1304
- const required2 = parseRequired(rawEvaluator.required);
1336
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1337
+ rawEvaluator.required,
1338
+ rawEvaluator.min_score,
1339
+ name,
1340
+ evalId
1341
+ );
1305
1342
  evaluators.push({
1306
1343
  name,
1307
1344
  type: "latency",
1308
1345
  threshold,
1309
1346
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1310
1347
  ...required2 !== void 0 ? { required: required2 } : {},
1348
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1311
1349
  ...negate !== void 0 ? { negate } : {}
1312
1350
  });
1313
1351
  continue;
@@ -1321,13 +1359,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1321
1359
  continue;
1322
1360
  }
1323
1361
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1324
- const required2 = parseRequired(rawEvaluator.required);
1362
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1363
+ rawEvaluator.required,
1364
+ rawEvaluator.min_score,
1365
+ name,
1366
+ evalId
1367
+ );
1325
1368
  evaluators.push({
1326
1369
  name,
1327
1370
  type: "cost",
1328
1371
  budget,
1329
1372
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1330
1373
  ...required2 !== void 0 ? { required: required2 } : {},
1374
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1331
1375
  ...negate !== void 0 ? { negate } : {}
1332
1376
  });
1333
1377
  continue;
@@ -1359,13 +1403,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1359
1403
  continue;
1360
1404
  }
1361
1405
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1362
- const required2 = parseRequired(rawEvaluator.required);
1406
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1407
+ rawEvaluator.required,
1408
+ rawEvaluator.min_score,
1409
+ name,
1410
+ evalId
1411
+ );
1363
1412
  evaluators.push({
1364
1413
  name,
1365
1414
  type: "token-usage",
1366
1415
  ...validLimits,
1367
1416
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1368
1417
  ...required2 !== void 0 ? { required: required2 } : {},
1418
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1369
1419
  ...negate !== void 0 ? { negate } : {}
1370
1420
  });
1371
1421
  continue;
@@ -1411,13 +1461,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1411
1461
  continue;
1412
1462
  }
1413
1463
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1414
- const required2 = parseRequired(rawEvaluator.required);
1464
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1465
+ rawEvaluator.required,
1466
+ rawEvaluator.min_score,
1467
+ name,
1468
+ evalId
1469
+ );
1415
1470
  evaluators.push({
1416
1471
  name,
1417
1472
  type: "execution-metrics",
1418
1473
  ...validThresholds,
1419
1474
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1420
1475
  ...required2 !== void 0 ? { required: required2 } : {},
1476
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1421
1477
  ...negate !== void 0 ? { negate } : {}
1422
1478
  });
1423
1479
  continue;
@@ -1431,7 +1487,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1431
1487
  const rawShouldTrigger = rawEvaluator.should_trigger;
1432
1488
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
1433
1489
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1434
- const required2 = parseRequired(rawEvaluator.required);
1490
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1491
+ rawEvaluator.required,
1492
+ rawEvaluator.min_score,
1493
+ name,
1494
+ evalId
1495
+ );
1435
1496
  evaluators.push({
1436
1497
  name,
1437
1498
  type: "skill-trigger",
@@ -1439,6 +1500,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1439
1500
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
1440
1501
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1441
1502
  ...required2 !== void 0 ? { required: required2 } : {},
1503
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1442
1504
  ...negate !== void 0 ? { negate } : {}
1443
1505
  });
1444
1506
  continue;
@@ -1450,13 +1512,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1450
1512
  continue;
1451
1513
  }
1452
1514
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1453
- const required2 = parseRequired(rawEvaluator.required);
1515
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1516
+ rawEvaluator.required,
1517
+ rawEvaluator.min_score,
1518
+ name,
1519
+ evalId
1520
+ );
1454
1521
  evaluators.push({
1455
1522
  name,
1456
1523
  type: "contains",
1457
1524
  value,
1458
1525
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1459
1526
  ...required2 !== void 0 ? { required: required2 } : {},
1527
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1460
1528
  ...negate !== void 0 ? { negate } : {}
1461
1529
  });
1462
1530
  continue;
@@ -1470,13 +1538,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1470
1538
  continue;
1471
1539
  }
1472
1540
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1473
- const required2 = parseRequired(rawEvaluator.required);
1541
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1542
+ rawEvaluator.required,
1543
+ rawEvaluator.min_score,
1544
+ name,
1545
+ evalId
1546
+ );
1474
1547
  evaluators.push({
1475
1548
  name,
1476
1549
  type: typeValue,
1477
1550
  value,
1478
1551
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1479
1552
  ...required2 !== void 0 ? { required: required2 } : {},
1553
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1480
1554
  ...negate !== void 0 ? { negate } : {}
1481
1555
  });
1482
1556
  continue;
@@ -1488,13 +1562,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1488
1562
  continue;
1489
1563
  }
1490
1564
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1491
- const required2 = parseRequired(rawEvaluator.required);
1565
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1566
+ rawEvaluator.required,
1567
+ rawEvaluator.min_score,
1568
+ name,
1569
+ evalId
1570
+ );
1492
1571
  evaluators.push({
1493
1572
  name,
1494
1573
  type: "icontains",
1495
1574
  value,
1496
1575
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1497
1576
  ...required2 !== void 0 ? { required: required2 } : {},
1577
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1498
1578
  ...negate !== void 0 ? { negate } : {}
1499
1579
  });
1500
1580
  continue;
@@ -1508,13 +1588,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1508
1588
  continue;
1509
1589
  }
1510
1590
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1511
- const required2 = parseRequired(rawEvaluator.required);
1591
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1592
+ rawEvaluator.required,
1593
+ rawEvaluator.min_score,
1594
+ name,
1595
+ evalId
1596
+ );
1512
1597
  evaluators.push({
1513
1598
  name,
1514
1599
  type: typeValue,
1515
1600
  value,
1516
1601
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1517
1602
  ...required2 !== void 0 ? { required: required2 } : {},
1603
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1518
1604
  ...negate !== void 0 ? { negate } : {}
1519
1605
  });
1520
1606
  continue;
@@ -1526,13 +1612,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1526
1612
  continue;
1527
1613
  }
1528
1614
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1529
- const required2 = parseRequired(rawEvaluator.required);
1615
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1616
+ rawEvaluator.required,
1617
+ rawEvaluator.min_score,
1618
+ name,
1619
+ evalId
1620
+ );
1530
1621
  evaluators.push({
1531
1622
  name,
1532
1623
  type: typeValue,
1533
1624
  value,
1534
1625
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1535
1626
  ...required2 !== void 0 ? { required: required2 } : {},
1627
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1536
1628
  ...negate !== void 0 ? { negate } : {}
1537
1629
  });
1538
1630
  continue;
@@ -1545,7 +1637,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1545
1637
  }
1546
1638
  const flags = asString(rawEvaluator.flags);
1547
1639
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1548
- const required2 = parseRequired(rawEvaluator.required);
1640
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1641
+ rawEvaluator.required,
1642
+ rawEvaluator.min_score,
1643
+ name,
1644
+ evalId
1645
+ );
1549
1646
  evaluators.push({
1550
1647
  name,
1551
1648
  type: "regex",
@@ -1553,18 +1650,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1553
1650
  ...flags !== void 0 ? { flags } : {},
1554
1651
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1555
1652
  ...required2 !== void 0 ? { required: required2 } : {},
1653
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1556
1654
  ...negate !== void 0 ? { negate } : {}
1557
1655
  });
1558
1656
  continue;
1559
1657
  }
1560
1658
  if (typeValue === "is-json") {
1561
1659
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1562
- const required2 = parseRequired(rawEvaluator.required);
1660
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1661
+ rawEvaluator.required,
1662
+ rawEvaluator.min_score,
1663
+ name,
1664
+ evalId
1665
+ );
1563
1666
  evaluators.push({
1564
1667
  name,
1565
1668
  type: "is-json",
1566
1669
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1567
1670
  ...required2 !== void 0 ? { required: required2 } : {},
1671
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1568
1672
  ...negate !== void 0 ? { negate } : {}
1569
1673
  });
1570
1674
  continue;
@@ -1576,13 +1680,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1576
1680
  continue;
1577
1681
  }
1578
1682
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1579
- const required2 = parseRequired(rawEvaluator.required);
1683
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1684
+ rawEvaluator.required,
1685
+ rawEvaluator.min_score,
1686
+ name,
1687
+ evalId
1688
+ );
1580
1689
  evaluators.push({
1581
1690
  name,
1582
1691
  type: "equals",
1583
1692
  value,
1584
1693
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1585
1694
  ...required2 !== void 0 ? { required: required2 } : {},
1695
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1586
1696
  ...negate !== void 0 ? { negate } : {}
1587
1697
  });
1588
1698
  continue;
@@ -1618,7 +1728,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1618
1728
  continue;
1619
1729
  }
1620
1730
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1621
- const required2 = parseRequired(rawEvaluator.required);
1731
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1732
+ rawEvaluator.required,
1733
+ rawEvaluator.min_score,
1734
+ name,
1735
+ evalId
1736
+ );
1622
1737
  evaluators.push({
1623
1738
  name,
1624
1739
  type: "llm-grader",
@@ -1626,6 +1741,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1626
1741
  ...graderTargetName ? { target: graderTargetName } : {},
1627
1742
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1628
1743
  ...required2 !== void 0 ? { required: required2 } : {},
1744
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1629
1745
  ...negate !== void 0 ? { negate } : {}
1630
1746
  });
1631
1747
  continue;
@@ -1695,7 +1811,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1695
1811
  continue;
1696
1812
  }
1697
1813
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1698
- const required2 = parseRequired(rawEvaluator.required);
1814
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
1815
+ rawEvaluator.required,
1816
+ rawEvaluator.min_score,
1817
+ name,
1818
+ evalId
1819
+ );
1699
1820
  evaluators.push({
1700
1821
  name,
1701
1822
  type: "llm-grader",
@@ -1703,12 +1824,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1703
1824
  ...graderTargetName ? { target: graderTargetName } : {},
1704
1825
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1705
1826
  ...required2 !== void 0 ? { required: required2 } : {},
1827
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
1706
1828
  ...negate !== void 0 ? { negate } : {}
1707
1829
  });
1708
1830
  continue;
1709
1831
  }
1710
1832
  const weight = validateWeight(rawEvaluator.weight, name, evalId);
1711
- const required = parseRequired(rawEvaluator.required);
1833
+ const { required, min_score } = parseRequiredAndMinScore(
1834
+ rawEvaluator.required,
1835
+ rawEvaluator.min_score,
1836
+ name,
1837
+ evalId
1838
+ );
1712
1839
  const knownProps = /* @__PURE__ */ new Set([
1713
1840
  "name",
1714
1841
  "type",
@@ -1719,6 +1846,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1719
1846
  "weight",
1720
1847
  "config",
1721
1848
  "required",
1849
+ "min_score",
1722
1850
  "negate",
1723
1851
  "max_steps",
1724
1852
  "maxSteps",
@@ -1748,6 +1876,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1748
1876
  ...graderTargetName ? { target: graderTargetName } : {},
1749
1877
  ...weight !== void 0 ? { weight } : {},
1750
1878
  ...required !== void 0 ? { required } : {},
1879
+ ...min_score !== void 0 ? { min_score } : {},
1751
1880
  ...negate !== void 0 ? { negate } : {},
1752
1881
  ...finalConfig ? { config: finalConfig } : {},
1753
1882
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -1879,10 +2008,23 @@ ${detailBlock}${ANSI_RESET4}`);
1879
2008
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
1880
2009
  }
1881
2010
  }
1882
- function parseRequired(value) {
1883
- if (value === true) return true;
1884
- if (typeof value === "number" && value > 0 && value <= 1) return value;
1885
- return void 0;
2011
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
2012
+ const result = {};
2013
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
2014
+ result.min_score = rawMinScore;
2015
+ }
2016
+ if (rawRequired === true) {
2017
+ result.required = true;
2018
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
2019
+ if (result.min_score === void 0) {
2020
+ result.min_score = rawRequired;
2021
+ }
2022
+ result.required = rawRequired;
2023
+ logWarning2(
2024
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
2025
+ );
2026
+ }
2027
+ return result;
1886
2028
  }
1887
2029
  function validateWeight(rawWeight, evaluatorName, evalId) {
1888
2030
  if (rawWeight === void 0) {
@@ -1925,16 +2067,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1925
2067
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
1926
2068
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
1927
2069
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
2070
+ let minScore;
1928
2071
  let requiredMinScore;
1929
2072
  let required;
1930
- if (typeof rawRubric.required_min_score === "number") {
1931
- const minScore = rawRubric.required_min_score;
1932
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
2073
+ if (typeof rawRubric.min_score === "number") {
2074
+ const ms = rawRubric.min_score;
2075
+ if (ms <= 0 || ms > 1) {
2076
+ throw new Error(
2077
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
2078
+ );
2079
+ }
2080
+ minScore = ms;
2081
+ requiredMinScore = Math.round(ms * 10);
2082
+ } else if (typeof rawRubric.required_min_score === "number") {
2083
+ const rms = rawRubric.required_min_score;
2084
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
1933
2085
  throw new Error(
1934
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
2086
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
1935
2087
  );
1936
2088
  }
1937
- requiredMinScore = minScore;
2089
+ requiredMinScore = rms;
2090
+ minScore = rms / 10;
2091
+ logWarning2(
2092
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
2093
+ );
1938
2094
  }
1939
2095
  if (typeof rawRubric.required === "boolean") {
1940
2096
  required = rawRubric.required;
@@ -1954,6 +2110,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1954
2110
  weight,
1955
2111
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
1956
2112
  ...required !== void 0 ? { required } : {},
2113
+ ...minScore !== void 0 ? { min_score: minScore } : {},
1957
2114
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
1958
2115
  score_ranges: scoreRanges
1959
2116
  });
@@ -1970,6 +2127,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
1970
2127
  weight,
1971
2128
  // Default to required: true if not specified (backward compatibility)
1972
2129
  required: required ?? true,
2130
+ ...minScore !== void 0 ? { min_score: minScore } : {},
1973
2131
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
1974
2132
  });
1975
2133
  }
@@ -2098,12 +2256,22 @@ function parseInlineRubrics(rawRubrics) {
2098
2256
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
2099
2257
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
2100
2258
  };
2259
+ let inlineMinScore;
2260
+ let inlineRequiredMinScore;
2261
+ if (typeof rubric.min_score === "number") {
2262
+ inlineMinScore = rubric.min_score;
2263
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
2264
+ } else if (typeof rubric.required_min_score === "number") {
2265
+ inlineRequiredMinScore = rubric.required_min_score;
2266
+ inlineMinScore = inlineRequiredMinScore / 10;
2267
+ }
2101
2268
  if (scoreRanges && scoreRanges.length > 0) {
2102
2269
  return {
2103
2270
  ...baseRubric,
2104
2271
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
2105
2272
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
2106
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
2273
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
2274
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
2107
2275
  score_ranges: scoreRanges
2108
2276
  };
2109
2277
  }
@@ -2111,7 +2279,8 @@ function parseInlineRubrics(rawRubrics) {
2111
2279
  ...baseRubric,
2112
2280
  outcome: expectedOutcome,
2113
2281
  required: typeof rubric.required === "boolean" ? rubric.required : true,
2114
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
2282
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
2283
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
2115
2284
  };
2116
2285
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
2117
2286
  if (rubricItems.length === 0) {
@@ -2511,6 +2680,9 @@ function resolveExpectedMessages(raw) {
2511
2680
  var ANSI_YELLOW5 = "\x1B[33m";
2512
2681
  var ANSI_RED2 = "\x1B[31m";
2513
2682
  var ANSI_RESET6 = "\x1B[0m";
2683
+ function matchesFilter(id, filter) {
2684
+ return typeof filter === "string" ? micromatch.isMatch(id, filter) : filter.some((pattern) => micromatch.isMatch(id, pattern));
2685
+ }
2514
2686
  function detectFormat(filePath) {
2515
2687
  const ext = path6.extname(filePath).toLowerCase();
2516
2688
  if (ext === ".jsonl") return "jsonl";
@@ -2578,40 +2750,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2578
2750
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
2579
2751
  const rawFile = await readFile5(absoluteTestPath, "utf8");
2580
2752
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
2581
- const fallbackEvalSet = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2582
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
2753
+ const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
2754
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
2583
2755
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
2584
2756
  const globalExecution = sidecar.execution;
2585
2757
  if (verbose) {
2586
2758
  console.log(`
2587
- [JSONL Dataset: ${evalFilePath}]`);
2759
+ [JSONL Suite: ${evalFilePath}]`);
2588
2760
  console.log(` Cases: ${rawCases.length}`);
2589
- console.log(` Eval set: ${evalSetName}`);
2761
+ console.log(` Suite: ${suiteName}`);
2590
2762
  if (sidecar.description) {
2591
2763
  console.log(` Description: ${sidecar.description}`);
2592
2764
  }
2593
2765
  }
2594
2766
  const results = [];
2595
2767
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
2596
- const evalcase = rawCases[lineIndex];
2768
+ const testCaseConfig = rawCases[lineIndex];
2597
2769
  const lineNumber = lineIndex + 1;
2598
- const id = asString4(evalcase.id);
2599
- if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
2770
+ const id = asString4(testCaseConfig.id);
2771
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
2600
2772
  continue;
2601
2773
  }
2602
- const conversationId = asString4(evalcase.conversation_id);
2603
- let outcome = asString4(evalcase.criteria);
2604
- if (!outcome && evalcase.expected_outcome !== void 0) {
2605
- outcome = asString4(evalcase.expected_outcome);
2774
+ const conversationId = asString4(testCaseConfig.conversation_id);
2775
+ let outcome = asString4(testCaseConfig.criteria);
2776
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
2777
+ outcome = asString4(testCaseConfig.expected_outcome);
2606
2778
  if (outcome) {
2607
2779
  logWarning4(
2608
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2780
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2609
2781
  );
2610
2782
  }
2611
2783
  }
2612
- const rawInputMessages = resolveInputMessages(evalcase);
2613
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2614
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
2784
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
2785
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
2786
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
2615
2787
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
2616
2788
  logError2(
2617
2789
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -2648,18 +2820,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2648
2820
  }
2649
2821
  }
2650
2822
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
2651
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
2823
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
2652
2824
  const mergedExecution = caseExecution ?? globalExecution;
2653
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
2825
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
2654
2826
  let evaluators;
2655
2827
  try {
2656
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
2828
+ evaluators = await parseEvaluators(
2829
+ testCaseConfig,
2830
+ mergedExecution,
2831
+ searchRoots,
2832
+ id ?? "unknown"
2833
+ );
2657
2834
  } catch (error) {
2658
2835
  const message = error instanceof Error ? error.message : String(error);
2659
2836
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
2660
2837
  continue;
2661
2838
  }
2662
- const inlineRubrics = evalcase.rubrics;
2839
+ const inlineRubrics = testCaseConfig.rubrics;
2663
2840
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
2664
2841
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
2665
2842
  if (rubricEvaluator) {
@@ -2670,7 +2847,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2670
2847
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
2671
2848
  const testCase = {
2672
2849
  id,
2673
- dataset: evalSetName,
2850
+ suite: suiteName,
2674
2851
  conversation_id: conversationId,
2675
2852
  question,
2676
2853
  input: inputMessages,
@@ -2678,7 +2855,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2678
2855
  reference_answer: referenceAnswer,
2679
2856
  file_paths: userFilePaths,
2680
2857
  criteria: outcome ?? "",
2681
- evaluator: evalCaseEvaluatorKind,
2858
+ evaluator: testCaseEvaluatorKind,
2682
2859
  assertions: evaluators
2683
2860
  };
2684
2861
  results.push(testCase);
@@ -2861,6 +3038,9 @@ function buildChatPromptFromSegments(options) {
2861
3038
  var ANSI_YELLOW6 = "\x1B[33m";
2862
3039
  var ANSI_RED3 = "\x1B[31m";
2863
3040
  var ANSI_RESET7 = "\x1B[0m";
3041
+ function matchesFilter2(id, filter) {
3042
+ return typeof filter === "string" ? micromatch2.isMatch(id, filter) : filter.some((pattern) => micromatch2.isMatch(id, pattern));
3043
+ }
2864
3044
  function resolveTests(suite) {
2865
3045
  if (suite.tests !== void 0) return suite.tests;
2866
3046
  if (suite.eval_cases !== void 0) {
@@ -2940,18 +3120,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2940
3120
  throw new Error(`Invalid test file format: ${evalFilePath}`);
2941
3121
  }
2942
3122
  const suite = interpolated;
2943
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
2944
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
2945
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
2946
- const rawTestcases = resolveTests(suite);
3123
+ const suiteNameFromFile = asString5(suite.name)?.trim();
3124
+ const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
3125
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
3126
+ const rawTestCases = resolveTests(suite);
2947
3127
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
2948
3128
  const evalFileDir = path7.dirname(absoluteTestPath);
2949
- let expandedTestcases;
2950
- if (typeof rawTestcases === "string") {
2951
- const externalPath = path7.resolve(evalFileDir, rawTestcases);
2952
- expandedTestcases = await loadCasesFromFile(externalPath);
2953
- } else if (Array.isArray(rawTestcases)) {
2954
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
3129
+ let expandedTestCases;
3130
+ if (typeof rawTestCases === "string") {
3131
+ const externalPath = path7.resolve(evalFileDir, rawTestCases);
3132
+ expandedTestCases = await loadCasesFromFile(externalPath);
3133
+ } else if (Array.isArray(rawTestCases)) {
3134
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
2955
3135
  } else {
2956
3136
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
2957
3137
  }
@@ -2966,32 +3146,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2966
3146
  }
2967
3147
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
2968
3148
  const results = [];
2969
- for (const rawEvalcase of expandedTestcases) {
2970
- if (!isJsonObject(rawEvalcase)) {
3149
+ for (const rawTestCase of expandedTestCases) {
3150
+ if (!isJsonObject(rawTestCase)) {
2971
3151
  logWarning5("Skipping invalid test entry (expected object)");
2972
3152
  continue;
2973
3153
  }
2974
- const evalcase = rawEvalcase;
2975
- const id = asString5(evalcase.id);
2976
- if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
3154
+ const testCaseConfig = rawTestCase;
3155
+ const id = asString5(testCaseConfig.id);
3156
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
2977
3157
  continue;
2978
3158
  }
2979
- const conversationId = asString5(evalcase.conversation_id);
2980
- let outcome = asString5(evalcase.criteria);
2981
- if (!outcome && evalcase.expected_outcome !== void 0) {
2982
- outcome = asString5(evalcase.expected_outcome);
3159
+ const conversationId = asString5(testCaseConfig.conversation_id);
3160
+ let outcome = asString5(testCaseConfig.criteria);
3161
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
3162
+ outcome = asString5(testCaseConfig.expected_outcome);
2983
3163
  if (outcome) {
2984
3164
  logWarning5(
2985
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
3165
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
2986
3166
  );
2987
3167
  }
2988
3168
  }
2989
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
3169
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
2990
3170
  const skipDefaults = caseExecution?.skip_defaults === true;
3171
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
2991
3172
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
2992
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
2993
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
2994
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
3173
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
3174
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
3175
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
2995
3176
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
2996
3177
  logError3(
2997
3178
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -3038,16 +3219,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3038
3219
  }
3039
3220
  }
3040
3221
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
3041
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
3222
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
3042
3223
  let evaluators;
3043
3224
  try {
3044
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
3225
+ evaluators = await parseEvaluators(
3226
+ testCaseConfig,
3227
+ globalExecution,
3228
+ searchRoots,
3229
+ id ?? "unknown"
3230
+ );
3045
3231
  } catch (error) {
3046
3232
  const message = error instanceof Error ? error.message : String(error);
3047
3233
  logError3(`Skipping test '${id}': ${message}`);
3048
3234
  continue;
3049
3235
  }
3050
- const inlineRubrics = evalcase.rubrics;
3236
+ const inlineRubrics = testCaseConfig.rubrics;
3051
3237
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
3052
3238
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
3053
3239
  if (rubricEvaluator) {
@@ -3056,13 +3242,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3056
3242
  }
3057
3243
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
3058
3244
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
3059
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
3245
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
3060
3246
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
3061
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
3062
- const caseTargets = extractTargetsFromTestCase(evalcase);
3247
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
3248
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
3063
3249
  const testCase = {
3064
3250
  id,
3065
- dataset: evalSetName,
3251
+ suite: suiteName,
3066
3252
  category: options?.category,
3067
3253
  conversation_id: conversationId,
3068
3254
  question,
@@ -3071,11 +3257,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
3071
3257
  reference_answer: referenceAnswer,
3072
3258
  file_paths: userFilePaths,
3073
3259
  criteria: outcome ?? "",
3074
- evaluator: evalCaseEvaluatorKind,
3260
+ evaluator: testCaseEvaluatorKind,
3075
3261
  assertions: evaluators,
3076
3262
  workspace: mergedWorkspace,
3077
3263
  metadata,
3078
- targets: caseTargets
3264
+ targets: caseTargets,
3265
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
3079
3266
  };
3080
3267
  results.push(testCase);
3081
3268
  }
@@ -3619,7 +3806,7 @@ var AzureProvider = class {
3619
3806
  };
3620
3807
  this.retryConfig = config.retry;
3621
3808
  const azure = createAzure(buildAzureOptions(config));
3622
- this.model = azure.chat(config.deploymentName);
3809
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
3623
3810
  }
3624
3811
  id;
3625
3812
  kind = "azure";
@@ -3745,7 +3932,9 @@ function buildAzureOptions(config) {
3745
3932
  const options = {
3746
3933
  apiKey: config.apiKey,
3747
3934
  apiVersion: config.version,
3748
- useDeploymentBasedUrls: true
3935
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
3936
+ // with existing deployments. Responses API should use the SDK's v1 path.
3937
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
3749
3938
  };
3750
3939
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
3751
3940
  if (baseURL) {
@@ -5216,15 +5405,16 @@ var CliProvider = class {
5216
5405
  outputFilePath
5217
5406
  );
5218
5407
  const renderedCommand = renderTemplate(this.config.command, templateValues);
5408
+ const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;
5219
5409
  if (this.verbose) {
5220
5410
  console.log(
5221
- `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
5411
+ `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
5222
5412
  );
5223
5413
  }
5224
5414
  try {
5225
5415
  const startTime = Date.now();
5226
5416
  const result = await this.runCommand(renderedCommand, {
5227
- cwd: this.config.cwd,
5417
+ cwd: effectiveCwd,
5228
5418
  env: process.env,
5229
5419
  timeoutMs: this.config.timeoutMs,
5230
5420
  signal: controller.signal
@@ -5257,7 +5447,7 @@ var CliProvider = class {
5257
5447
  command: renderedCommand,
5258
5448
  stderr: result.stderr,
5259
5449
  exitCode: result.exitCode ?? 0,
5260
- cwd: this.config.cwd,
5450
+ cwd: effectiveCwd,
5261
5451
  outputFile: outputFilePath
5262
5452
  }
5263
5453
  };
@@ -5275,7 +5465,7 @@ var CliProvider = class {
5275
5465
  command: renderedCommand,
5276
5466
  stderr: result.stderr,
5277
5467
  exitCode: result.exitCode ?? 0,
5278
- cwd: this.config.cwd,
5468
+ cwd: effectiveCwd,
5279
5469
  outputFile: outputFilePath,
5280
5470
  error: errorMessage
5281
5471
  }
@@ -5290,7 +5480,7 @@ var CliProvider = class {
5290
5480
  command: renderedCommand,
5291
5481
  stderr: result.stderr,
5292
5482
  exitCode: result.exitCode ?? 0,
5293
- cwd: this.config.cwd,
5483
+ cwd: effectiveCwd,
5294
5484
  outputFile: outputFilePath,
5295
5485
  recordId: evalCaseId
5296
5486
  }
@@ -7240,9 +7430,9 @@ var MockProvider = class {
7240
7430
  };
7241
7431
 
7242
7432
  // src/evaluation/providers/pi-cli.ts
7243
- import { spawn as spawn3 } from "node:child_process";
7433
+ import { execSync, spawn as spawn3 } from "node:child_process";
7244
7434
  import { randomUUID as randomUUID7 } from "node:crypto";
7245
- import { createWriteStream as createWriteStream5 } from "node:fs";
7435
+ import { accessSync, createWriteStream as createWriteStream5, readFileSync as readFileSync2 } from "node:fs";
7246
7436
  import { mkdir as mkdir6, mkdtemp, rm, writeFile } from "node:fs/promises";
7247
7437
  import { tmpdir } from "node:os";
7248
7438
  import path19 from "node:path";
@@ -7300,6 +7490,75 @@ function subscribeToPiLogEntries(listener) {
7300
7490
  };
7301
7491
  }
7302
7492
 
7493
+ // src/evaluation/providers/pi-provider-aliases.ts
7494
+ var SUBPROVIDER_ALIASES = {
7495
+ azure: "azure-openai-responses"
7496
+ };
7497
+ var SUBPROVIDER_ALIASES_WITH_BASE_URL = {
7498
+ // Azure v1 endpoints are OpenAI-compatible; use the standard client
7499
+ // to avoid AzureOpenAI adding api-version query params.
7500
+ azure: "openai-responses"
7501
+ };
7502
+ var ENV_KEY_MAP = {
7503
+ google: "GEMINI_API_KEY",
7504
+ gemini: "GEMINI_API_KEY",
7505
+ anthropic: "ANTHROPIC_API_KEY",
7506
+ openai: "OPENAI_API_KEY",
7507
+ groq: "GROQ_API_KEY",
7508
+ xai: "XAI_API_KEY",
7509
+ openrouter: "OPENROUTER_API_KEY",
7510
+ azure: "AZURE_OPENAI_API_KEY"
7511
+ };
7512
+ var ENV_BASE_URL_MAP = {
7513
+ openai: "OPENAI_BASE_URL",
7514
+ azure: "AZURE_OPENAI_BASE_URL",
7515
+ openrouter: "OPENROUTER_BASE_URL"
7516
+ };
7517
+ function resolveSubprovider(name, hasBaseUrl = false) {
7518
+ const lower = name.toLowerCase();
7519
+ if (hasBaseUrl) {
7520
+ const alias = SUBPROVIDER_ALIASES_WITH_BASE_URL[lower];
7521
+ if (alias) return alias;
7522
+ }
7523
+ return SUBPROVIDER_ALIASES[lower] ?? name;
7524
+ }
7525
+ function resolveCliProvider(name) {
7526
+ const lower = name.toLowerCase();
7527
+ if (lower === "azure") return "azure-openai-responses";
7528
+ return name;
7529
+ }
7530
+ function resolveEnvKeyName(provider, hasBaseUrl = false) {
7531
+ const lower = provider.toLowerCase();
7532
+ if (hasBaseUrl && lower === "azure") return "OPENAI_API_KEY";
7533
+ return ENV_KEY_MAP[lower];
7534
+ }
7535
+ function resolveEnvBaseUrlName(provider, hasBaseUrl = false) {
7536
+ const lower = provider.toLowerCase();
7537
+ if (hasBaseUrl && lower === "azure") return "OPENAI_BASE_URL";
7538
+ return ENV_BASE_URL_MAP[lower];
7539
+ }
7540
+ function extractAzureResourceName(baseUrl) {
7541
+ const urlMatch = baseUrl.match(/^https?:\/\/([^./]+)/);
7542
+ if (urlMatch) return urlMatch[1];
7543
+ return baseUrl;
7544
+ }
7545
+ function normalizeAzureSdkBaseUrl(baseUrl) {
7546
+ const trimmed = baseUrl.trim().replace(/\/+$/, "");
7547
+ if (!trimmed) {
7548
+ return trimmed;
7549
+ }
7550
+ if (!/^https?:\/\//i.test(trimmed)) {
7551
+ return `https://${trimmed}.openai.azure.com/openai/v1`;
7552
+ }
7553
+ if (/\/openai\/v1$/i.test(trimmed)) {
7554
+ return trimmed;
7555
+ }
7556
+ if (/\/openai$/i.test(trimmed)) {
7557
+ return `${trimmed}/v1`;
7558
+ }
7559
+ return `${trimmed}/openai/v1`;
7560
+ }
7561
+
7303
7562
  // src/evaluation/providers/pi-utils.ts
7304
7563
  function extractPiTextContent(content) {
7305
7564
  if (typeof content === "string") {
@@ -7458,12 +7717,12 @@ var PiCliProvider = class {
7458
7717
  buildPiArgs(prompt, inputFiles) {
7459
7718
  const args = [];
7460
7719
  if (this.config.subprovider) {
7461
- args.push("--provider", this.config.subprovider);
7720
+ args.push("--provider", resolveCliProvider(this.config.subprovider));
7462
7721
  }
7463
7722
  if (this.config.model) {
7464
7723
  args.push("--model", this.config.model);
7465
7724
  }
7466
- if (this.config.apiKey) {
7725
+ if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== "azure") {
7467
7726
  args.push("--api-key", this.config.apiKey);
7468
7727
  }
7469
7728
  args.push("--mode", "json");
@@ -7515,35 +7774,35 @@ ${prompt}` : prompt;
7515
7774
  }
7516
7775
  buildEnv() {
7517
7776
  const env = { ...process.env };
7518
- if (this.config.apiKey) {
7519
- const provider = this.config.subprovider?.toLowerCase() ?? "google";
7520
- const ENV_KEY_MAP = {
7521
- google: "GEMINI_API_KEY",
7522
- gemini: "GEMINI_API_KEY",
7523
- anthropic: "ANTHROPIC_API_KEY",
7524
- openai: "OPENAI_API_KEY",
7525
- groq: "GROQ_API_KEY",
7526
- xai: "XAI_API_KEY",
7527
- openrouter: "OPENROUTER_API_KEY"
7528
- };
7529
- const envKey = ENV_KEY_MAP[provider];
7530
- if (envKey) {
7531
- env[envKey] = this.config.apiKey;
7777
+ const provider = this.config.subprovider?.toLowerCase() ?? "google";
7778
+ if (provider === "azure") {
7779
+ if (this.config.apiKey) {
7780
+ env.AZURE_OPENAI_API_KEY = this.config.apiKey;
7781
+ }
7782
+ if (this.config.baseUrl) {
7783
+ env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
7784
+ }
7785
+ } else {
7786
+ if (this.config.apiKey) {
7787
+ const envKey = resolveEnvKeyName(provider);
7788
+ if (envKey) {
7789
+ env[envKey] = this.config.apiKey;
7790
+ }
7532
7791
  }
7533
7792
  }
7534
7793
  if (this.config.subprovider) {
7535
- const provider = this.config.subprovider.toLowerCase();
7794
+ const resolvedProvider = resolveCliProvider(this.config.subprovider);
7536
7795
  const PROVIDER_OWN_PREFIXES = {
7537
7796
  openrouter: ["OPENROUTER_"],
7538
7797
  anthropic: ["ANTHROPIC_"],
7539
7798
  openai: ["OPENAI_"],
7540
- azure: ["AZURE_OPENAI_"],
7799
+ "azure-openai-responses": ["AZURE_OPENAI_"],
7541
7800
  google: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
7542
7801
  gemini: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
7543
7802
  groq: ["GROQ_"],
7544
7803
  xai: ["XAI_"]
7545
7804
  };
7546
- const ownPrefixes = PROVIDER_OWN_PREFIXES[provider] ?? [];
7805
+ const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
7547
7806
  const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES).filter(([key]) => key !== provider).flatMap(([, prefixes]) => prefixes);
7548
7807
  for (const key of Object.keys(env)) {
7549
7808
  if (allOtherPrefixes.some((prefix) => key.startsWith(prefix)) && !ownPrefixes.some((prefix) => key.startsWith(prefix))) {
@@ -7834,6 +8093,24 @@ function extractMessages(events) {
7834
8093
  }
7835
8094
  }
7836
8095
  }
8096
+ if (messages) {
8097
+ for (let i = messages.length - 1; i >= 0; i--) {
8098
+ if (messages[i].role === "assistant" && !messages[i].content) {
8099
+ for (let j = events.length - 1; j >= 0; j--) {
8100
+ const evt = events[j];
8101
+ if (!evt || evt.type !== "message_end") continue;
8102
+ const msg = evt.message;
8103
+ if (msg?.role !== "assistant") continue;
8104
+ const text = extractPiTextContent(msg.content);
8105
+ if (text) {
8106
+ messages[i] = { ...messages[i], content: text };
8107
+ break;
8108
+ }
8109
+ }
8110
+ break;
8111
+ }
8112
+ }
8113
+ }
7837
8114
  const eventToolCalls = extractToolCallsFromEvents(events);
7838
8115
  if (eventToolCalls.length > 0) {
7839
8116
  injectEventToolCalls(messages, eventToolCalls);
@@ -8018,17 +8295,43 @@ function formatTimeoutSuffix3(timeoutMs) {
8018
8295
  if (!timeoutMs || timeoutMs <= 0) return "";
8019
8296
  return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
8020
8297
  }
8298
+ function resolveWindowsCmd(executable) {
8299
+ if (process.platform !== "win32") return [executable, []];
8300
+ const lower = executable.toLowerCase();
8301
+ if (lower.endsWith(".js") || lower.endsWith(".exe")) return [executable, []];
8302
+ let fullPath;
8303
+ try {
8304
+ fullPath = execSync(`where ${executable}`, { encoding: "utf-8" }).trim().split(/\r?\n/)[0].trim();
8305
+ } catch {
8306
+ return [executable, []];
8307
+ }
8308
+ const cmdPath = fullPath.endsWith(".cmd") ? fullPath : `${fullPath}.cmd`;
8309
+ try {
8310
+ const content = readFileSync2(cmdPath, "utf-8");
8311
+ const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
8312
+ if (match) {
8313
+ const dp0 = path19.dirname(path19.resolve(cmdPath));
8314
+ const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${path19.sep}`);
8315
+ try {
8316
+ accessSync(scriptPath);
8317
+ return ["node", [scriptPath]];
8318
+ } catch {
8319
+ }
8320
+ }
8321
+ } catch {
8322
+ }
8323
+ return [executable, []];
8324
+ }
8021
8325
  async function defaultPiRunner(options) {
8022
8326
  return await new Promise((resolve, reject) => {
8023
8327
  const parts = options.executable.split(/\s+/);
8024
- const executable = parts[0];
8025
- const executableArgs = parts.slice(1);
8328
+ const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
8329
+ const executableArgs = [...prefixArgs, ...parts.slice(1)];
8026
8330
  const allArgs = [...executableArgs, ...options.args];
8027
- const child = spawn3(executable, allArgs, {
8331
+ const child = spawn3(resolvedExe, allArgs, {
8028
8332
  cwd: options.cwd,
8029
8333
  env: options.env,
8030
- stdio: ["pipe", "pipe", "pipe"],
8031
- shell: false
8334
+ stdio: ["pipe", "pipe", "pipe"]
8032
8335
  });
8033
8336
  let stdout = "";
8034
8337
  let stderr = "";
@@ -8083,13 +8386,43 @@ async function defaultPiRunner(options) {
8083
8386
  }
8084
8387
 
8085
8388
  // src/evaluation/providers/pi-coding-agent.ts
8086
- import { execSync } from "node:child_process";
8389
+ import { execSync as execSync2 } from "node:child_process";
8087
8390
  import { randomUUID as randomUUID8 } from "node:crypto";
8088
- import { accessSync, createWriteStream as createWriteStream6 } from "node:fs";
8391
+ import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
8089
8392
  import { mkdir as mkdir7 } from "node:fs/promises";
8090
- import path20 from "node:path";
8393
+ import path21 from "node:path";
8091
8394
  import { createInterface } from "node:readline";
8092
- import { fileURLToPath as fileURLToPath3 } from "node:url";
8395
+ import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
8396
+
8397
+ // src/paths.ts
8398
+ import os2 from "node:os";
8399
+ import path20 from "node:path";
8400
+ var logged = false;
8401
+ function getAgentvHome() {
8402
+ const envHome = process.env.AGENTV_HOME;
8403
+ if (envHome && envHome !== "undefined") {
8404
+ if (!logged) {
8405
+ logged = true;
8406
+ console.warn(`Using AGENTV_HOME: ${envHome}`);
8407
+ }
8408
+ return envHome;
8409
+ }
8410
+ return path20.join(os2.homedir(), ".agentv");
8411
+ }
8412
+ function getWorkspacesRoot() {
8413
+ return path20.join(getAgentvHome(), "workspaces");
8414
+ }
8415
+ function getSubagentsRoot() {
8416
+ return path20.join(getAgentvHome(), "subagents");
8417
+ }
8418
+ function getTraceStateRoot() {
8419
+ return path20.join(getAgentvHome(), "trace-state");
8420
+ }
8421
+ function getWorkspacePoolRoot() {
8422
+ return path20.join(getAgentvHome(), "workspace-pool");
8423
+ }
8424
+
8425
+ // src/evaluation/providers/pi-coding-agent.ts
8093
8426
  var piCodingAgentModule = null;
8094
8427
  var piAiModule = null;
8095
8428
  var loadingPromise = null;
@@ -8107,46 +8440,126 @@ async function promptInstall() {
8107
8440
  rl.close();
8108
8441
  }
8109
8442
  }
8110
- function findAgentvRoot() {
8111
- const thisFile = fileURLToPath3(import.meta.url);
8112
- let dir = path20.dirname(thisFile);
8113
- for (let i = 0; i < 10; i++) {
8443
+ function findManagedSdkInstallRoot() {
8444
+ return path21.join(getAgentvHome(), "deps", "pi-sdk");
8445
+ }
8446
+ function resolveGlobalNpmRoot() {
8447
+ try {
8448
+ const root = execSync2("npm root -g", {
8449
+ encoding: "utf-8",
8450
+ stdio: ["ignore", "pipe", "ignore"]
8451
+ }).trim();
8452
+ return root.length > 0 ? root : void 0;
8453
+ } catch {
8454
+ return void 0;
8455
+ }
8456
+ }
8457
+ function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
8458
+ return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
8459
+ }
8460
+ function findAccessiblePath(paths) {
8461
+ for (const candidate of paths) {
8114
8462
  try {
8115
- const pkg = path20.join(dir, "package.json");
8116
- accessSync(pkg);
8117
- return dir;
8463
+ accessSync2(candidate);
8464
+ return candidate;
8118
8465
  } catch {
8119
- const parent = path20.dirname(dir);
8120
- if (parent === dir) break;
8121
- dir = parent;
8122
8466
  }
8123
8467
  }
8124
- return path20.dirname(thisFile);
8468
+ return void 0;
8125
8469
  }
8126
- async function doLoadSdkModules() {
8470
+ async function tryImportLocalSdkModules() {
8127
8471
  try {
8128
8472
  [piCodingAgentModule, piAiModule] = await Promise.all([
8129
8473
  import("@mariozechner/pi-coding-agent"),
8130
8474
  import("@mariozechner/pi-ai")
8131
8475
  ]);
8476
+ return true;
8132
8477
  } catch {
8133
- if (await promptInstall()) {
8134
- const installDir = findAgentvRoot();
8135
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
8136
- execSync("bun add @mariozechner/pi-coding-agent", {
8137
- cwd: installDir,
8138
- stdio: "inherit"
8139
- });
8140
- [piCodingAgentModule, piAiModule] = await Promise.all([
8141
- import("@mariozechner/pi-coding-agent"),
8142
- import("@mariozechner/pi-ai")
8143
- ]);
8144
- } else {
8145
- throw new Error(
8146
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
8147
- );
8478
+ return false;
8479
+ }
8480
+ }
8481
+ async function tryImportManagedSdkModules() {
8482
+ const managedRoot = findManagedSdkInstallRoot();
8483
+ const piCodingAgentEntry = findAccessiblePath([
8484
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
8485
+ ]);
8486
+ const piAiEntry = findAccessiblePath([
8487
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
8488
+ path21.join(
8489
+ managedRoot,
8490
+ "node_modules",
8491
+ "@mariozechner",
8492
+ "pi-coding-agent",
8493
+ "node_modules",
8494
+ "@mariozechner",
8495
+ "pi-ai",
8496
+ "dist",
8497
+ "index.js"
8498
+ )
8499
+ ]);
8500
+ if (!piCodingAgentEntry || !piAiEntry) return false;
8501
+ try {
8502
+ [piCodingAgentModule, piAiModule] = await Promise.all([
8503
+ import(pathToFileURL(piCodingAgentEntry).href),
8504
+ import(pathToFileURL(piAiEntry).href)
8505
+ ]);
8506
+ return true;
8507
+ } catch {
8508
+ return false;
8509
+ }
8510
+ }
8511
+ async function tryImportGlobalSdkModules() {
8512
+ const globalNpmRoot = resolveGlobalNpmRoot();
8513
+ if (!globalNpmRoot) return false;
8514
+ const piCodingAgentEntry = findAccessiblePath([
8515
+ buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
8516
+ ]);
8517
+ const piAiEntry = findAccessiblePath([
8518
+ buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
8519
+ path21.join(
8520
+ globalNpmRoot,
8521
+ "@mariozechner",
8522
+ "pi-coding-agent",
8523
+ "node_modules",
8524
+ "@mariozechner",
8525
+ "pi-ai",
8526
+ "dist",
8527
+ "index.js"
8528
+ )
8529
+ ]);
8530
+ if (!piCodingAgentEntry || !piAiEntry) return false;
8531
+ try {
8532
+ [piCodingAgentModule, piAiModule] = await Promise.all([
8533
+ import(pathToFileURL(piCodingAgentEntry).href),
8534
+ import(pathToFileURL(piAiEntry).href)
8535
+ ]);
8536
+ return true;
8537
+ } catch {
8538
+ return false;
8539
+ }
8540
+ }
8541
+ function installSdkModules(installDir) {
8542
+ console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
8543
+ mkdirSync(installDir, { recursive: true });
8544
+ execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
8545
+ cwd: installDir,
8546
+ stdio: "inherit"
8547
+ });
8548
+ }
8549
+ async function doLoadSdkModules() {
8550
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
8551
+ return;
8552
+ }
8553
+ if (await promptInstall()) {
8554
+ const installDir = findManagedSdkInstallRoot();
8555
+ installSdkModules(installDir);
8556
+ if (await tryImportManagedSdkModules()) {
8557
+ return;
8148
8558
  }
8149
8559
  }
8560
+ throw new Error(
8561
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
8562
+ );
8150
8563
  }
8151
8564
  async function loadSdkModules() {
8152
8565
  if (!piCodingAgentModule || !piAiModule) {
@@ -8174,7 +8587,9 @@ async function loadSdkModules() {
8174
8587
  codingTools: piSdk.codingTools,
8175
8588
  toolMap,
8176
8589
  SessionManager: piSdk.SessionManager,
8177
- getModel: piAi.getModel
8590
+ getModel: piAi.getModel,
8591
+ // biome-ignore lint/suspicious/noExplicitAny: registerBuiltInApiProviders exists at runtime but not in type defs
8592
+ registerBuiltInApiProviders: piAi.registerBuiltInApiProviders
8178
8593
  };
8179
8594
  }
8180
8595
  var PiCodingAgentProvider = class {
@@ -8196,17 +8611,35 @@ var PiCodingAgentProvider = class {
8196
8611
  const startTime = (/* @__PURE__ */ new Date()).toISOString();
8197
8612
  const startMs = Date.now();
8198
8613
  const sdk = await loadSdkModules();
8614
+ sdk.registerBuiltInApiProviders();
8199
8615
  const logger = await this.createStreamLogger(request).catch(() => void 0);
8200
8616
  try {
8201
8617
  const cwd = this.resolveCwd(request.cwd);
8202
- const providerName = this.config.subprovider ?? "google";
8618
+ const rawProvider = this.config.subprovider ?? "google";
8619
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
8620
+ const hasBaseUrl = !!normalizedBaseUrl;
8621
+ const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
8203
8622
  const modelId = this.config.model ?? "gemini-2.5-flash";
8204
- this.setApiKeyEnv(providerName);
8205
- const model = sdk.getModel(providerName, modelId);
8623
+ this.setApiKeyEnv(rawProvider, hasBaseUrl);
8624
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
8625
+ let model = sdk.getModel(providerName, modelId);
8626
+ if (model && normalizedBaseUrl) {
8627
+ model = { ...model, baseUrl: normalizedBaseUrl };
8628
+ }
8206
8629
  if (!model) {
8207
- throw new Error(
8208
- `pi-coding-agent: getModel('${providerName}', '${modelId}') returned undefined. The model '${modelId}' is not registered for provider '${providerName}' in pi-ai. Check that subprovider and model are correct in your target config.`
8209
- );
8630
+ const envProvider = providerName.replace(/-responses$/, "");
8631
+ model = {
8632
+ id: modelId,
8633
+ name: modelId,
8634
+ api: providerName,
8635
+ provider: envProvider,
8636
+ baseUrl: normalizedBaseUrl ?? "",
8637
+ reasoning: false,
8638
+ input: ["text"],
8639
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
8640
+ contextWindow: 128e3,
8641
+ maxTokens: 16384
8642
+ };
8210
8643
  }
8211
8644
  const tools = this.resolveTools(sdk);
8212
8645
  const { session } = await sdk.createAgentSession({
@@ -8359,28 +8792,35 @@ ${fileList}`;
8359
8792
  }
8360
8793
  }
8361
8794
  /** Maps config apiKey to the provider-specific env var the SDK reads. */
8362
- setApiKeyEnv(providerName) {
8795
+ setApiKeyEnv(providerName, hasBaseUrl = false) {
8363
8796
  if (!this.config.apiKey) return;
8364
- const ENV_KEY_MAP = {
8365
- google: "GEMINI_API_KEY",
8366
- gemini: "GEMINI_API_KEY",
8367
- anthropic: "ANTHROPIC_API_KEY",
8368
- openai: "OPENAI_API_KEY",
8369
- groq: "GROQ_API_KEY",
8370
- xai: "XAI_API_KEY",
8371
- openrouter: "OPENROUTER_API_KEY"
8372
- };
8373
- const envKey = ENV_KEY_MAP[providerName.toLowerCase()];
8797
+ const envKey = resolveEnvKeyName(providerName, hasBaseUrl);
8374
8798
  if (envKey) {
8375
8799
  process.env[envKey] = this.config.apiKey;
8376
8800
  }
8377
8801
  }
8802
+ /** Maps config baseUrl to the provider-specific env var the SDK reads. */
8803
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
8804
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
8805
+ if (!normalizedBaseUrl) return;
8806
+ const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
8807
+ if (envKey) {
8808
+ process.env[envKey] = normalizedBaseUrl;
8809
+ }
8810
+ }
8811
+ normalizeSdkBaseUrl(providerName, baseUrl) {
8812
+ if (!baseUrl) return void 0;
8813
+ if (providerName.toLowerCase() === "azure") {
8814
+ return normalizeAzureSdkBaseUrl(baseUrl);
8815
+ }
8816
+ return baseUrl;
8817
+ }
8378
8818
  resolveCwd(cwdOverride) {
8379
8819
  if (cwdOverride) {
8380
- return path20.resolve(cwdOverride);
8820
+ return path21.resolve(cwdOverride);
8381
8821
  }
8382
8822
  if (this.config.cwd) {
8383
- return path20.resolve(this.config.cwd);
8823
+ return path21.resolve(this.config.cwd);
8384
8824
  }
8385
8825
  return process.cwd();
8386
8826
  }
@@ -8399,9 +8839,9 @@ ${fileList}`;
8399
8839
  }
8400
8840
  resolveLogDirectory() {
8401
8841
  if (this.config.logDir) {
8402
- return path20.resolve(this.config.logDir);
8842
+ return path21.resolve(this.config.logDir);
8403
8843
  }
8404
- return path20.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
8844
+ return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
8405
8845
  }
8406
8846
  async createStreamLogger(request) {
8407
8847
  const logDir = this.resolveLogDirectory();
@@ -8415,7 +8855,7 @@ ${fileList}`;
8415
8855
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
8416
8856
  return void 0;
8417
8857
  }
8418
- const filePath = path20.join(logDir, buildLogFilename6(request, this.targetName));
8858
+ const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
8419
8859
  try {
8420
8860
  const logger = await PiStreamLogger2.create({
8421
8861
  filePath,
@@ -8640,7 +9080,7 @@ import path30 from "node:path";
8640
9080
  // src/evaluation/providers/vscode/utils/fs.ts
8641
9081
  import { constants as constants2 } from "node:fs";
8642
9082
  import { access as access2, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
8643
- import path21 from "node:path";
9083
+ import path22 from "node:path";
8644
9084
  async function pathExists(target) {
8645
9085
  try {
8646
9086
  await access2(target, constants2.F_OK);
@@ -8656,7 +9096,7 @@ async function readDirEntries(target) {
8656
9096
  const entries = await readdir2(target, { withFileTypes: true });
8657
9097
  return entries.map((entry) => ({
8658
9098
  name: entry.name,
8659
- absolutePath: path21.join(target, entry.name),
9099
+ absolutePath: path22.join(target, entry.name),
8660
9100
  isDirectory: entry.isDirectory()
8661
9101
  }));
8662
9102
  }
@@ -8671,9 +9111,9 @@ async function removeIfExists(target) {
8671
9111
  }
8672
9112
 
8673
9113
  // src/evaluation/providers/vscode/utils/path.ts
8674
- import path22 from "node:path";
9114
+ import path23 from "node:path";
8675
9115
  function pathToFileUri2(filePath) {
8676
- const absolutePath = path22.isAbsolute(filePath) ? filePath : path22.resolve(filePath);
9116
+ const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
8677
9117
  const normalizedPath = absolutePath.replace(/\\/g, "/");
8678
9118
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
8679
9119
  return `file:///${normalizedPath}`;
@@ -8682,7 +9122,7 @@ function pathToFileUri2(filePath) {
8682
9122
  }
8683
9123
 
8684
9124
  // src/evaluation/providers/vscode/dispatch/promptBuilder.ts
8685
- import path23 from "node:path";
9125
+ import path24 from "node:path";
8686
9126
 
8687
9127
  // src/evaluation/providers/vscode/utils/template.ts
8688
9128
  function renderTemplate2(content, variables) {
@@ -8774,8 +9214,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
8774
9214
  });
8775
9215
  }
8776
9216
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
8777
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path23.basename(file)}`).join("\n");
8778
- const responseList = responseFiles.map((file) => `"${path23.basename(file)}"`).join(", ");
9217
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
9218
+ const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
8779
9219
  return renderTemplate2(templateContent, {
8780
9220
  requestFiles: requestLines,
8781
9221
  responseList
@@ -8784,7 +9224,7 @@ function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateCont
8784
9224
 
8785
9225
  // src/evaluation/providers/vscode/dispatch/responseWaiter.ts
8786
9226
  import { readFile as readFile9 } from "node:fs/promises";
8787
- import path24 from "node:path";
9227
+ import path25 from "node:path";
8788
9228
 
8789
9229
  // src/evaluation/providers/vscode/utils/time.ts
8790
9230
  function sleep2(ms) {
@@ -8843,7 +9283,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
8843
9283
  }
8844
9284
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
8845
9285
  if (!silent) {
8846
- const fileList = responseFilesFinal.map((file) => path24.basename(file)).join(", ");
9286
+ const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
8847
9287
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
8848
9288
  }
8849
9289
  const deadline = Date.now() + timeoutMs;
@@ -8852,7 +9292,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
8852
9292
  while (pending.size > 0) {
8853
9293
  if (Date.now() >= deadline) {
8854
9294
  if (!silent) {
8855
- const remaining = [...pending].map((f) => path24.basename(f)).join(", ");
9295
+ const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
8856
9296
  console.error(
8857
9297
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
8858
9298
  );
@@ -8908,36 +9348,6 @@ import { promisify as promisify2 } from "node:util";
8908
9348
 
8909
9349
  // src/evaluation/providers/vscode/dispatch/constants.ts
8910
9350
  import path26 from "node:path";
8911
-
8912
- // src/paths.ts
8913
- import os2 from "node:os";
8914
- import path25 from "node:path";
8915
- var logged = false;
8916
- function getAgentvHome() {
8917
- const envHome = process.env.AGENTV_HOME;
8918
- if (envHome && envHome !== "undefined") {
8919
- if (!logged) {
8920
- logged = true;
8921
- console.warn(`Using AGENTV_HOME: ${envHome}`);
8922
- }
8923
- return envHome;
8924
- }
8925
- return path25.join(os2.homedir(), ".agentv");
8926
- }
8927
- function getWorkspacesRoot() {
8928
- return path25.join(getAgentvHome(), "workspaces");
8929
- }
8930
- function getSubagentsRoot() {
8931
- return path25.join(getAgentvHome(), "subagents");
8932
- }
8933
- function getTraceStateRoot() {
8934
- return path25.join(getAgentvHome(), "trace-state");
8935
- }
8936
- function getWorkspacePoolRoot() {
8937
- return path25.join(getAgentvHome(), "workspace-pool");
8938
- }
8939
-
8940
- // src/evaluation/providers/vscode/dispatch/constants.ts
8941
9351
  var DEFAULT_LOCK_NAME = "subagent.lock";
8942
9352
  var DEFAULT_ALIVE_FILENAME = ".alive";
8943
9353
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -10194,9 +10604,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
10194
10604
  }
10195
10605
 
10196
10606
  // src/evaluation/evaluators/scoring.ts
10197
- var PASS_THRESHOLD = 0.8;
10198
- function scoreToVerdict(score) {
10199
- return score >= PASS_THRESHOLD ? "pass" : "fail";
10607
+ var DEFAULT_THRESHOLD = 0.8;
10608
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
10609
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
10610
+ return score >= threshold ? "pass" : "fail";
10200
10611
  }
10201
10612
  function clampScore(value) {
10202
10613
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -10385,13 +10796,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
10385
10796
  async function execShellWithStdin(command, stdinPayload, options = {}) {
10386
10797
  const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
10387
10798
  const { tmpdir: tmpdir3 } = await import("node:os");
10388
- const path49 = await import("node:path");
10799
+ const path50 = await import("node:path");
10389
10800
  const { randomUUID: randomUUID10 } = await import("node:crypto");
10390
- const dir = path49.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10801
+ const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10391
10802
  await mkdir16(dir, { recursive: true });
10392
- const stdinPath = path49.join(dir, "stdin.txt");
10393
- const stdoutPath = path49.join(dir, "stdout.txt");
10394
- const stderrPath = path49.join(dir, "stderr.txt");
10803
+ const stdinPath = path50.join(dir, "stdin.txt");
10804
+ const stdoutPath = path50.join(dir, "stdout.txt");
10805
+ const stderrPath = path50.join(dir, "stderr.txt");
10395
10806
  await writeFile9(stdinPath, stdinPayload, "utf8");
10396
10807
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
10397
10808
  const { spawn: spawn5 } = await import("node:child_process");
@@ -11589,7 +12000,7 @@ ${outputSchema}`;
11589
12000
  parts.push("[[ ## scoring_criteria ## ]]");
11590
12001
  for (const rubric of rubrics) {
11591
12002
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
11592
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
12003
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
11593
12004
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
11594
12005
  if (rubric.outcome) {
11595
12006
  parts.push(`Description: ${rubric.outcome}`);
@@ -11643,54 +12054,106 @@ ${outputSchema}`;
11643
12054
  async runWithRetry(options) {
11644
12055
  const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
11645
12056
  let lastError;
12057
+ let lastInvalidResponse;
12058
+ let shouldAttemptStructureFix = false;
11646
12059
  for (let attempt = 1; attempt <= 3; attempt++) {
11647
12060
  try {
11648
- const model = graderProvider.asLanguageModel?.();
11649
- if (model) {
11650
- const modelOptions = {
11651
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11652
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11653
- };
11654
- const hasImages = images && images.length > 0;
11655
- const result = hasImages ? await generateText2({
11656
- model,
11657
- system: systemPrompt,
11658
- messages: [
11659
- {
11660
- role: "user",
11661
- content: [
11662
- { type: "text", text: userPrompt },
11663
- ...toAiSdkImageParts(images)
11664
- ]
11665
- }
11666
- ],
11667
- ...modelOptions
11668
- }) : await generateText2({
11669
- model,
11670
- system: systemPrompt,
11671
- prompt: userPrompt,
11672
- ...modelOptions
11673
- });
11674
- const data2 = schema.parse(parseJsonFromText(result.text));
11675
- const rawUsage = result.usage;
11676
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
11677
- return { data: data2, tokenUsage };
12061
+ const result = await this.generateStructuredResponse({
12062
+ context,
12063
+ graderProvider,
12064
+ systemPrompt,
12065
+ userPrompt,
12066
+ images
12067
+ });
12068
+ const canRepairResponse = result.text.trim().length > 0;
12069
+ lastInvalidResponse = canRepairResponse ? result : void 0;
12070
+ let data;
12071
+ try {
12072
+ data = schema.parse(parseJsonFromText(result.text));
12073
+ } catch (e) {
12074
+ lastError = e instanceof Error ? e : new Error(String(e));
12075
+ shouldAttemptStructureFix = canRepairResponse;
12076
+ continue;
11678
12077
  }
11679
- const response = await graderProvider.invoke({
11680
- question: userPrompt,
12078
+ return {
12079
+ data,
12080
+ providerResponse: result.providerResponse,
12081
+ tokenUsage: result.tokenUsage
12082
+ };
12083
+ } catch (e) {
12084
+ lastError = e instanceof Error ? e : new Error(String(e));
12085
+ }
12086
+ }
12087
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
12088
+ try {
12089
+ const repaired = await this.generateStructuredResponse({
12090
+ context,
12091
+ graderProvider,
11681
12092
  systemPrompt,
11682
- evalCaseId: context.evalCase.id,
11683
- attempt: context.attempt,
11684
- maxOutputTokens: this.maxOutputTokens,
11685
- temperature: this.temperature
12093
+ userPrompt: buildStructureRepairPrompt({
12094
+ validationError: lastError?.message ?? "Schema validation failed",
12095
+ invalidResponse: lastInvalidResponse.text
12096
+ })
11686
12097
  });
11687
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
11688
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
12098
+ const data = schema.parse(parseJsonFromText(repaired.text));
12099
+ return {
12100
+ data,
12101
+ providerResponse: repaired.providerResponse,
12102
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
12103
+ };
11689
12104
  } catch (e) {
11690
12105
  lastError = e instanceof Error ? e : new Error(String(e));
11691
12106
  }
11692
12107
  }
11693
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
12108
+ throw new Error(
12109
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
12110
+ );
12111
+ }
12112
+ async generateStructuredResponse(options) {
12113
+ const { context, graderProvider, systemPrompt, userPrompt, images } = options;
12114
+ const model = graderProvider.asLanguageModel?.();
12115
+ if (model) {
12116
+ const modelOptions = {
12117
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
12118
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
12119
+ };
12120
+ const hasImages = images && images.length > 0;
12121
+ const result = hasImages ? await generateText2({
12122
+ model,
12123
+ system: systemPrompt,
12124
+ messages: [
12125
+ {
12126
+ role: "user",
12127
+ content: [
12128
+ { type: "text", text: userPrompt },
12129
+ ...toAiSdkImageParts(images)
12130
+ ]
12131
+ }
12132
+ ],
12133
+ ...modelOptions
12134
+ }) : await generateText2({
12135
+ model,
12136
+ system: systemPrompt,
12137
+ prompt: userPrompt,
12138
+ ...modelOptions
12139
+ });
12140
+ const rawUsage = result.usage;
12141
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
12142
+ return { text: result.text, tokenUsage };
12143
+ }
12144
+ const response = await graderProvider.invoke({
12145
+ question: userPrompt,
12146
+ systemPrompt,
12147
+ evalCaseId: context.evalCase.id,
12148
+ attempt: context.attempt,
12149
+ maxOutputTokens: this.maxOutputTokens,
12150
+ temperature: this.temperature
12151
+ });
12152
+ return {
12153
+ text: extractLastAssistantContent(response.output),
12154
+ providerResponse: response,
12155
+ tokenUsage: response.tokenUsage
12156
+ };
11694
12157
  }
11695
12158
  };
11696
12159
  function buildOutputSchema() {
@@ -11710,6 +12173,29 @@ function buildOutputSchema() {
11710
12173
  "}"
11711
12174
  ].join("\n");
11712
12175
  }
12176
+ function buildStructureRepairPrompt(options) {
12177
+ const { validationError, invalidResponse } = options;
12178
+ return [
12179
+ "The following evaluation response has useful grading content but invalid JSON structure.",
12180
+ "Repair it to satisfy the schema in the system prompt.",
12181
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
12182
+ "",
12183
+ "Validation error:",
12184
+ validationError,
12185
+ "",
12186
+ "Invalid response:",
12187
+ invalidResponse
12188
+ ].join("\n");
12189
+ }
12190
+ function sumTokenUsage(first, second) {
12191
+ if (!first && !second) {
12192
+ return void 0;
12193
+ }
12194
+ return {
12195
+ input: (first?.input ?? 0) + (second?.input ?? 0),
12196
+ output: (first?.output ?? 0) + (second?.output ?? 0)
12197
+ };
12198
+ }
11713
12199
  function buildRubricOutputSchema() {
11714
12200
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
11715
12201
  You must return a valid JSON object matching this schema:
@@ -11809,19 +12295,21 @@ function calculateScoreRangeResult(result, rubrics) {
11809
12295
  rawScores[rubric.id] = rawScore;
11810
12296
  totalWeight += rubric.weight;
11811
12297
  weightedScoreSum += normalizedScore * rubric.weight;
11812
- let requiredMinScore;
11813
- if (rubric.required_min_score !== void 0) {
11814
- requiredMinScore = rubric.required_min_score;
12298
+ let minScoreThreshold;
12299
+ if (rubric.min_score !== void 0) {
12300
+ minScoreThreshold = rubric.min_score;
12301
+ } else if (rubric.required_min_score !== void 0) {
12302
+ minScoreThreshold = rubric.required_min_score / 10;
11815
12303
  } else if (rubric.required === true) {
11816
- requiredMinScore = 10;
12304
+ minScoreThreshold = 1;
11817
12305
  }
11818
12306
  const matchingRange = rubric.score_ranges?.find(
11819
12307
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
11820
12308
  );
11821
12309
  const rangeDescription = matchingRange?.outcome ?? "";
11822
12310
  const criterionLabel = rubric.outcome ?? rubric.id;
11823
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
11824
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
12311
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
12312
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
11825
12313
  failedRequired = true;
11826
12314
  }
11827
12315
  assertions.push({
@@ -11898,11 +12386,11 @@ function createFilesystemTools(workspacePath) {
11898
12386
  execute: async (input) => {
11899
12387
  try {
11900
12388
  const resolved = resolveSandboxed(workspacePath, input.path);
11901
- const stat10 = await fs2.stat(resolved);
11902
- if (stat10.isDirectory()) {
12389
+ const stat11 = await fs2.stat(resolved);
12390
+ if (stat11.isDirectory()) {
11903
12391
  return { error: `'${input.path}' is a directory, not a file` };
11904
12392
  }
11905
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
12393
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
11906
12394
  const fd = await fs2.open(resolved, "r");
11907
12395
  try {
11908
12396
  await fd.read(buffer, 0, buffer.length, 0);
@@ -11910,8 +12398,8 @@ function createFilesystemTools(workspacePath) {
11910
12398
  await fd.close();
11911
12399
  }
11912
12400
  const content = buffer.toString("utf-8");
11913
- const truncated = stat10.size > MAX_FILE_SIZE;
11914
- return { content, truncated, size: stat10.size };
12401
+ const truncated = stat11.size > MAX_FILE_SIZE;
12402
+ return { content, truncated, size: stat11.size };
11915
12403
  } catch (error) {
11916
12404
  return { error: error instanceof Error ? error.message : String(error) };
11917
12405
  }
@@ -11962,8 +12450,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
11962
12450
  const ext = path35.extname(entry.name).toLowerCase();
11963
12451
  if (BINARY_EXTENSIONS.has(ext)) continue;
11964
12452
  try {
11965
- const stat10 = await fs2.stat(fullPath);
11966
- if (stat10.size > MAX_FILE_SIZE) continue;
12453
+ const stat11 = await fs2.stat(fullPath);
12454
+ if (stat11.size > MAX_FILE_SIZE) continue;
11967
12455
  const content = await fs2.readFile(fullPath, "utf-8");
11968
12456
  const lines = content.split("\n");
11969
12457
  for (let i = 0; i < lines.length; i++) {
@@ -12604,115 +13092,115 @@ var FieldAccuracyEvaluator = class {
12604
13092
  * Evaluate a single field against the expected value.
12605
13093
  */
12606
13094
  evaluateField(fieldConfig, candidateData, expectedData) {
12607
- const { path: path49, match, required = true, weight = 1 } = fieldConfig;
12608
- const candidateValue = resolvePath(candidateData, path49);
12609
- const expectedValue = resolvePath(expectedData, path49);
13095
+ const { path: path50, match, required = true, weight = 1 } = fieldConfig;
13096
+ const candidateValue = resolvePath(candidateData, path50);
13097
+ const expectedValue = resolvePath(expectedData, path50);
12610
13098
  if (expectedValue === void 0) {
12611
13099
  return {
12612
- path: path49,
13100
+ path: path50,
12613
13101
  score: 1,
12614
13102
  // No expected value means no comparison needed
12615
13103
  weight,
12616
13104
  hit: true,
12617
- message: `${path49}: no expected value`
13105
+ message: `${path50}: no expected value`
12618
13106
  };
12619
13107
  }
12620
13108
  if (candidateValue === void 0) {
12621
13109
  if (required) {
12622
13110
  return {
12623
- path: path49,
13111
+ path: path50,
12624
13112
  score: 0,
12625
13113
  weight,
12626
13114
  hit: false,
12627
- message: `${path49} (required, missing)`
13115
+ message: `${path50} (required, missing)`
12628
13116
  };
12629
13117
  }
12630
13118
  return {
12631
- path: path49,
13119
+ path: path50,
12632
13120
  score: 1,
12633
13121
  // Don't penalize missing optional fields
12634
13122
  weight: 0,
12635
13123
  // Zero weight means it won't affect the score
12636
13124
  hit: true,
12637
- message: `${path49}: optional field missing`
13125
+ message: `${path50}: optional field missing`
12638
13126
  };
12639
13127
  }
12640
13128
  switch (match) {
12641
13129
  case "exact":
12642
- return this.compareExact(path49, candidateValue, expectedValue, weight);
13130
+ return this.compareExact(path50, candidateValue, expectedValue, weight);
12643
13131
  case "numeric_tolerance":
12644
13132
  return this.compareNumericTolerance(
12645
- path49,
13133
+ path50,
12646
13134
  candidateValue,
12647
13135
  expectedValue,
12648
13136
  fieldConfig,
12649
13137
  weight
12650
13138
  );
12651
13139
  case "date":
12652
- return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
13140
+ return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
12653
13141
  default:
12654
13142
  return {
12655
- path: path49,
13143
+ path: path50,
12656
13144
  score: 0,
12657
13145
  weight,
12658
13146
  hit: false,
12659
- message: `${path49}: unknown match type "${match}"`
13147
+ message: `${path50}: unknown match type "${match}"`
12660
13148
  };
12661
13149
  }
12662
13150
  }
12663
13151
  /**
12664
13152
  * Exact equality comparison.
12665
13153
  */
12666
- compareExact(path49, candidateValue, expectedValue, weight) {
13154
+ compareExact(path50, candidateValue, expectedValue, weight) {
12667
13155
  if (deepEqual(candidateValue, expectedValue)) {
12668
13156
  return {
12669
- path: path49,
13157
+ path: path50,
12670
13158
  score: 1,
12671
13159
  weight,
12672
13160
  hit: true,
12673
- message: path49
13161
+ message: path50
12674
13162
  };
12675
13163
  }
12676
13164
  if (typeof candidateValue !== typeof expectedValue) {
12677
13165
  return {
12678
- path: path49,
13166
+ path: path50,
12679
13167
  score: 0,
12680
13168
  weight,
12681
13169
  hit: false,
12682
- message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
13170
+ message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12683
13171
  };
12684
13172
  }
12685
13173
  return {
12686
- path: path49,
13174
+ path: path50,
12687
13175
  score: 0,
12688
13176
  weight,
12689
13177
  hit: false,
12690
- message: `${path49} (value mismatch)`
13178
+ message: `${path50} (value mismatch)`
12691
13179
  };
12692
13180
  }
12693
13181
  /**
12694
13182
  * Numeric comparison with absolute or relative tolerance.
12695
13183
  */
12696
- compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
13184
+ compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
12697
13185
  const { tolerance = 0, relative = false } = fieldConfig;
12698
13186
  const candidateNum = toNumber(candidateValue);
12699
13187
  const expectedNum = toNumber(expectedValue);
12700
13188
  if (candidateNum === null || expectedNum === null) {
12701
13189
  return {
12702
- path: path49,
13190
+ path: path50,
12703
13191
  score: 0,
12704
13192
  weight,
12705
13193
  hit: false,
12706
- message: `${path49} (non-numeric value)`
13194
+ message: `${path50} (non-numeric value)`
12707
13195
  };
12708
13196
  }
12709
13197
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12710
13198
  return {
12711
- path: path49,
13199
+ path: path50,
12712
13200
  score: 0,
12713
13201
  weight,
12714
13202
  hit: false,
12715
- message: `${path49} (invalid numeric value)`
13203
+ message: `${path50} (invalid numeric value)`
12716
13204
  };
12717
13205
  }
12718
13206
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12725,61 +13213,61 @@ var FieldAccuracyEvaluator = class {
12725
13213
  }
12726
13214
  if (withinTolerance) {
12727
13215
  return {
12728
- path: path49,
13216
+ path: path50,
12729
13217
  score: 1,
12730
13218
  weight,
12731
13219
  hit: true,
12732
- message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
13220
+ message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
12733
13221
  };
12734
13222
  }
12735
13223
  return {
12736
- path: path49,
13224
+ path: path50,
12737
13225
  score: 0,
12738
13226
  weight,
12739
13227
  hit: false,
12740
- message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
13228
+ message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12741
13229
  };
12742
13230
  }
12743
13231
  /**
12744
13232
  * Date comparison with format normalization.
12745
13233
  */
12746
- compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
13234
+ compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
12747
13235
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12748
13236
  const candidateDate = parseDate(String(candidateValue), formats);
12749
13237
  const expectedDate = parseDate(String(expectedValue), formats);
12750
13238
  if (candidateDate === null) {
12751
13239
  return {
12752
- path: path49,
13240
+ path: path50,
12753
13241
  score: 0,
12754
13242
  weight,
12755
13243
  hit: false,
12756
- message: `${path49} (unparseable candidate date)`
13244
+ message: `${path50} (unparseable candidate date)`
12757
13245
  };
12758
13246
  }
12759
13247
  if (expectedDate === null) {
12760
13248
  return {
12761
- path: path49,
13249
+ path: path50,
12762
13250
  score: 0,
12763
13251
  weight,
12764
13252
  hit: false,
12765
- message: `${path49} (unparseable expected date)`
13253
+ message: `${path50} (unparseable expected date)`
12766
13254
  };
12767
13255
  }
12768
13256
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12769
13257
  return {
12770
- path: path49,
13258
+ path: path50,
12771
13259
  score: 1,
12772
13260
  weight,
12773
13261
  hit: true,
12774
- message: path49
13262
+ message: path50
12775
13263
  };
12776
13264
  }
12777
13265
  return {
12778
- path: path49,
13266
+ path: path50,
12779
13267
  score: 0,
12780
13268
  weight,
12781
13269
  hit: false,
12782
- message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
13270
+ message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12783
13271
  };
12784
13272
  }
12785
13273
  /**
@@ -12812,11 +13300,11 @@ var FieldAccuracyEvaluator = class {
12812
13300
  };
12813
13301
  }
12814
13302
  };
12815
- function resolvePath(obj, path49) {
12816
- if (!path49 || !obj) {
13303
+ function resolvePath(obj, path50) {
13304
+ if (!path50 || !obj) {
12817
13305
  return void 0;
12818
13306
  }
12819
- const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
13307
+ const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
12820
13308
  let current = obj;
12821
13309
  for (const part of parts) {
12822
13310
  if (current === null || current === void 0) {
@@ -13308,8 +13796,8 @@ var TokenUsageEvaluator = class {
13308
13796
  };
13309
13797
 
13310
13798
  // src/evaluation/evaluators/tool-trajectory.ts
13311
- function getNestedValue(obj, path49) {
13312
- const parts = path49.split(".");
13799
+ function getNestedValue(obj, path50) {
13800
+ const parts = path50.split(".");
13313
13801
  let current = obj;
13314
13802
  for (const part of parts) {
13315
13803
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -15081,7 +15569,7 @@ var WorkspacePoolManager = class {
15081
15569
  }
15082
15570
  /**
15083
15571
  * Reset an existing slot for reuse:
15084
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
15572
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
15085
15573
  * 2. Re-copy template files (skip repo directories)
15086
15574
  */
15087
15575
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -15094,7 +15582,17 @@ var WorkspacePoolManager = class {
15094
15582
  continue;
15095
15583
  }
15096
15584
  const ref = repo.checkout?.ref ?? "HEAD";
15097
- await git(["reset", "--hard", ref], { cwd: repoDir });
15585
+ const resolve = repo.checkout?.resolve ?? "remote";
15586
+ if (resolve === "remote") {
15587
+ const fetchArgs = ["fetch", "origin", ref];
15588
+ if (repo.clone?.depth) {
15589
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
15590
+ }
15591
+ await git(fetchArgs, { cwd: repoDir });
15592
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
15593
+ } else {
15594
+ await git(["reset", "--hard", ref], { cwd: repoDir });
15595
+ }
15098
15596
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
15099
15597
  await git(["clean", cleanFlag], { cwd: repoDir });
15100
15598
  }
@@ -15391,7 +15889,7 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
15391
15889
  }
15392
15890
 
15393
15891
  // src/evaluation/orchestrator.ts
15394
- function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
15892
+ function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
15395
15893
  return score >= threshold ? "ok" : "quality_failure";
15396
15894
  }
15397
15895
  function buildSkippedEvaluatorError(scores) {
@@ -15483,7 +15981,7 @@ async function runEvaluation(options) {
15483
15981
  const filteredEvalCases = filterEvalCases(evalCases, filter);
15484
15982
  if (filteredEvalCases.length === 0) {
15485
15983
  if (filter) {
15486
- throw new Error(`No tests matched filter '${filter}' in ${evalFilePath}`);
15984
+ throw new Error(`No tests matched filter '${formatFilter(filter)}' in ${evalFilePath}`);
15487
15985
  }
15488
15986
  return [];
15489
15987
  }
@@ -15509,20 +16007,10 @@ async function runEvaluation(options) {
15509
16007
  if (resolvedTargetsByName.has(name)) {
15510
16008
  return resolvedTargetsByName.get(name);
15511
16009
  }
15512
- let definition = targetDefinitions.get(name);
16010
+ const definition = resolveDelegatedTargetDefinition(name, targetDefinitions, envLookup);
15513
16011
  if (!definition) {
15514
16012
  return void 0;
15515
16013
  }
15516
- for (let depth = 0; depth < 5; depth++) {
15517
- const useTarget = definition.use_target;
15518
- if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
15519
- const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
15520
- const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
15521
- if (resolvedName.length === 0) break;
15522
- const next = targetDefinitions.get(resolvedName);
15523
- if (!next) break;
15524
- definition = next;
15525
- }
15526
16014
  const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
15527
16015
  resolvedTargetsByName.set(name, resolved);
15528
16016
  return resolved;
@@ -15545,6 +16033,9 @@ async function runEvaluation(options) {
15545
16033
  const graderName = targetContext.graderTarget ?? targetContext.name;
15546
16034
  const resolvedGrader = resolveTargetByName(graderName);
15547
16035
  if (!resolvedGrader) {
16036
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
16037
+ return void 0;
16038
+ }
15548
16039
  return getOrCreateProvider(targetContext);
15549
16040
  }
15550
16041
  return getOrCreateProvider(resolvedGrader);
@@ -15875,7 +16366,7 @@ async function runEvaluation(options) {
15875
16366
  const budgetResult = {
15876
16367
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15877
16368
  testId: evalCase.id,
15878
- dataset: evalCase.dataset,
16369
+ suite: evalCase.suite,
15879
16370
  category: evalCase.category,
15880
16371
  score: 0,
15881
16372
  assertions: [],
@@ -15912,7 +16403,7 @@ async function runEvaluation(options) {
15912
16403
  const haltResult = {
15913
16404
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15914
16405
  testId: evalCase.id,
15915
- dataset: evalCase.dataset,
16406
+ suite: evalCase.suite,
15916
16407
  category: evalCase.category,
15917
16408
  score: 0,
15918
16409
  assertions: [],
@@ -16224,7 +16715,7 @@ async function runBatchEvaluation(options) {
16224
16715
  targetResolver,
16225
16716
  availableTargets,
16226
16717
  verbose,
16227
- threshold: batchThreshold
16718
+ threshold: evalCase.threshold ?? batchThreshold
16228
16719
  });
16229
16720
  if (providerError) {
16230
16721
  result = {
@@ -16686,8 +17177,9 @@ async function runEvalCase(options) {
16686
17177
  fileChanges,
16687
17178
  workspacePath,
16688
17179
  verbose,
16689
- threshold: caseThreshold
17180
+ threshold: evalCase.threshold ?? caseThreshold
16690
17181
  });
17182
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
16691
17183
  const totalDurationMs = Date.now() - caseStartMs;
16692
17184
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
16693
17185
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -16701,7 +17193,7 @@ async function runEvalCase(options) {
16701
17193
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
16702
17194
  };
16703
17195
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
16704
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
17196
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
16705
17197
  const targetUsedField = targetUsed ? { targetUsed } : {};
16706
17198
  const finalResult = providerError ? {
16707
17199
  ...result,
@@ -16902,7 +17394,8 @@ async function evaluateCandidate(options) {
16902
17394
  targetResolver,
16903
17395
  availableTargets,
16904
17396
  fileChanges,
16905
- workspacePath
17397
+ workspacePath,
17398
+ threshold: evalThreshold
16906
17399
  });
16907
17400
  const completedAt = nowFn();
16908
17401
  let agentRequest;
@@ -16933,7 +17426,7 @@ async function evaluateCandidate(options) {
16933
17426
  return {
16934
17427
  timestamp: completedAt.toISOString(),
16935
17428
  testId: evalCase.id,
16936
- dataset: evalCase.dataset,
17429
+ suite: evalCase.suite,
16937
17430
  category: evalCase.category,
16938
17431
  conversationId: evalCase.conversation_id,
16939
17432
  score: score.score,
@@ -16976,7 +17469,8 @@ async function runEvaluatorsForCase(options) {
16976
17469
  targetResolver,
16977
17470
  availableTargets,
16978
17471
  fileChanges,
16979
- workspacePath
17472
+ workspacePath,
17473
+ threshold
16980
17474
  } = options;
16981
17475
  if (evalCase.assertions && evalCase.assertions.length > 0) {
16982
17476
  return runEvaluatorList({
@@ -17002,7 +17496,8 @@ async function runEvaluatorsForCase(options) {
17002
17496
  targetResolver,
17003
17497
  availableTargets,
17004
17498
  fileChanges,
17005
- workspacePath
17499
+ workspacePath,
17500
+ threshold
17006
17501
  });
17007
17502
  }
17008
17503
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -17104,7 +17599,8 @@ async function runEvaluatorList(options) {
17104
17599
  name: evaluatorConfig.name,
17105
17600
  type: evaluatorConfig.type,
17106
17601
  weight,
17107
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17602
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
17603
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
17108
17604
  });
17109
17605
  scores.push({
17110
17606
  name: evaluatorConfig.name,
@@ -17139,7 +17635,8 @@ async function runEvaluatorList(options) {
17139
17635
  name: evaluatorConfig.name ?? "unknown",
17140
17636
  type: evaluatorConfig.type ?? "llm-grader",
17141
17637
  weight,
17142
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
17638
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
17639
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
17143
17640
  });
17144
17641
  scores.push({
17145
17642
  name: evaluatorConfig.name ?? "unknown",
@@ -17173,9 +17670,10 @@ async function runEvaluatorList(options) {
17173
17670
  }
17174
17671
  }
17175
17672
  }
17673
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
17176
17674
  const hasRequiredFailure = scored.some((entry) => {
17177
17675
  if (!entry.required) return false;
17178
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
17676
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
17179
17677
  return entry.score.score < minScore;
17180
17678
  });
17181
17679
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -17186,17 +17684,23 @@ async function runEvaluatorList(options) {
17186
17684
  const expectedAspectCount = assertions.length || 1;
17187
17685
  const score = {
17188
17686
  score: aggregateScore,
17189
- verdict: scoreToVerdict(aggregateScore),
17687
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
17190
17688
  assertions,
17191
17689
  expectedAspectCount
17192
17690
  };
17193
17691
  return { score, scores };
17194
17692
  }
17693
+ function formatFilter(filter) {
17694
+ return typeof filter === "string" ? filter : filter.join(", ");
17695
+ }
17696
+ function matchesFilter3(id, filter) {
17697
+ return typeof filter === "string" ? micromatch3.isMatch(id, filter) : filter.some((pattern) => micromatch3.isMatch(id, pattern));
17698
+ }
17195
17699
  function filterEvalCases(evalCases, filter) {
17196
17700
  if (!filter) {
17197
17701
  return evalCases;
17198
17702
  }
17199
- return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
17703
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter));
17200
17704
  }
17201
17705
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
17202
17706
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -17283,7 +17787,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
17283
17787
  return {
17284
17788
  timestamp: timestamp.toISOString(),
17285
17789
  testId: evalCase.id,
17286
- dataset: evalCase.dataset,
17790
+ suite: evalCase.suite,
17287
17791
  category: evalCase.category,
17288
17792
  conversationId: evalCase.conversation_id,
17289
17793
  score: 0,
@@ -17555,6 +18059,7 @@ async function evaluate(config) {
17555
18059
  verbose: config.verbose,
17556
18060
  maxConcurrency: config.workers ?? 3,
17557
18061
  filter: config.filter,
18062
+ threshold: config.threshold,
17558
18063
  evalCases,
17559
18064
  onResult: async (result) => {
17560
18065
  collectedResults.push(result);
@@ -17565,19 +18070,19 @@ async function evaluate(config) {
17565
18070
  const durationMs = Date.now() - startTime;
17566
18071
  return {
17567
18072
  results: allResults,
17568
- summary: computeSummary(allResults, durationMs)
18073
+ summary: computeSummary(allResults, durationMs, config.threshold)
17569
18074
  };
17570
18075
  }
17571
18076
  function mapAssertionType(type) {
17572
18077
  return type.replace(/_/g, "-");
17573
18078
  }
17574
- function computeSummary(results, durationMs) {
18079
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
17575
18080
  const total = results.length;
17576
18081
  let passed = 0;
17577
18082
  let scoreSum = 0;
17578
18083
  for (const r of results) {
17579
18084
  scoreSum += r.score;
17580
- if (r.score >= PASS_THRESHOLD) {
18085
+ if (r.score >= threshold) {
17581
18086
  passed++;
17582
18087
  }
17583
18088
  }
@@ -17608,7 +18113,7 @@ async function discoverDefaultTarget(repoRoot) {
17608
18113
  return null;
17609
18114
  }
17610
18115
  async function loadEnvHierarchy(repoRoot, startPath) {
17611
- const { readFileSync: readFileSync3 } = await import("node:fs");
18116
+ const { readFileSync: readFileSync4 } = await import("node:fs");
17612
18117
  const chain = buildDirectoryChain(startPath, repoRoot);
17613
18118
  const envFiles = [];
17614
18119
  for (const dir of chain) {
@@ -17617,7 +18122,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
17617
18122
  }
17618
18123
  for (let i = 0; i < envFiles.length; i++) {
17619
18124
  try {
17620
- const content = readFileSync3(envFiles[i], "utf8");
18125
+ const content = readFileSync4(envFiles[i], "utf8");
17621
18126
  for (const line of content.split("\n")) {
17622
18127
  const trimmed = line.trim();
17623
18128
  if (!trimmed || trimmed.startsWith("#")) continue;
@@ -17690,7 +18195,7 @@ var CONFIG_FILE_NAMES = [
17690
18195
  ];
17691
18196
  async function loadTsConfig(projectRoot) {
17692
18197
  const { existsSync: existsSync7 } = await import("node:fs");
17693
- const { pathToFileURL } = await import("node:url");
18198
+ const { pathToFileURL: pathToFileURL2 } = await import("node:url");
17694
18199
  const { join: join2 } = await import("node:path");
17695
18200
  for (const fileName of CONFIG_FILE_NAMES) {
17696
18201
  const filePath = join2(projectRoot, fileName);
@@ -17698,7 +18203,7 @@ async function loadTsConfig(projectRoot) {
17698
18203
  continue;
17699
18204
  }
17700
18205
  try {
17701
- const fileUrl = pathToFileURL(filePath).href;
18206
+ const fileUrl = pathToFileURL2(filePath).href;
17702
18207
  const mod = await import(fileUrl);
17703
18208
  const config = mod.default ?? mod;
17704
18209
  return AgentVConfigSchema.parse(config);
@@ -17832,7 +18337,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
17832
18337
  }
17833
18338
 
17834
18339
  // src/projects.ts
17835
- import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync2, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
18340
+ import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
17836
18341
  import path47 from "node:path";
17837
18342
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
17838
18343
  function getProjectsRegistryPath() {
@@ -17844,7 +18349,7 @@ function loadProjectRegistry() {
17844
18349
  return { projects: [] };
17845
18350
  }
17846
18351
  try {
17847
- const raw = readFileSync2(registryPath, "utf-8");
18352
+ const raw = readFileSync3(registryPath, "utf-8");
17848
18353
  const parsed = parseYaml3(raw);
17849
18354
  if (!parsed || !Array.isArray(parsed.projects)) {
17850
18355
  return { projects: [] };
@@ -17858,7 +18363,7 @@ function saveProjectRegistry(registry) {
17858
18363
  const registryPath = getProjectsRegistryPath();
17859
18364
  const dir = path47.dirname(registryPath);
17860
18365
  if (!existsSync6(dir)) {
17861
- mkdirSync(dir, { recursive: true });
18366
+ mkdirSync2(dir, { recursive: true });
17862
18367
  }
17863
18368
  writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
17864
18369
  }
@@ -18124,7 +18629,7 @@ var OtelTraceExporter = class {
18124
18629
  rootSpan.setAttribute("gen_ai.system", "agentv");
18125
18630
  rootSpan.setAttribute("agentv.test_id", result.testId);
18126
18631
  rootSpan.setAttribute("agentv.target", result.target);
18127
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
18632
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
18128
18633
  rootSpan.setAttribute("agentv.score", result.score);
18129
18634
  if (captureContent && result.output.length > 0) {
18130
18635
  const lastMsg = result.output[result.output.length - 1];
@@ -18333,7 +18838,7 @@ var OtelStreamingObserver = class {
18333
18838
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
18334
18839
  this.rootSpan.setAttribute("agentv.test_id", testId);
18335
18840
  this.rootSpan.setAttribute("agentv.target", target);
18336
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
18841
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
18337
18842
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
18338
18843
  }
18339
18844
  /** Create and immediately export a tool span */
@@ -18682,11 +19187,241 @@ function extractToolResultContent(content) {
18682
19187
  return parts.length > 0 ? parts.join("") : void 0;
18683
19188
  }
18684
19189
 
18685
- // src/import/session-discovery.ts
19190
+ // src/import/codex-parser.ts
19191
+ function parseCodexSession(jsonl) {
19192
+ const messages = [];
19193
+ let sessionId = "";
19194
+ let cwd;
19195
+ let model;
19196
+ let version;
19197
+ let startTimestamp;
19198
+ let endTimestamp;
19199
+ const pendingCalls = /* @__PURE__ */ new Map();
19200
+ const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
19201
+ for (const line of lines) {
19202
+ let entry;
19203
+ try {
19204
+ entry = JSON.parse(line);
19205
+ } catch {
19206
+ continue;
19207
+ }
19208
+ if (!entry.type) continue;
19209
+ if (entry.timestamp) {
19210
+ if (!startTimestamp) startTimestamp = entry.timestamp;
19211
+ endTimestamp = entry.timestamp;
19212
+ }
19213
+ const payload = entry.payload ?? {};
19214
+ switch (entry.type) {
19215
+ case "session_meta": {
19216
+ sessionId = String(payload.id ?? "");
19217
+ cwd = payload.cwd ? String(payload.cwd) : void 0;
19218
+ version = payload.cli_version ? String(payload.cli_version) : void 0;
19219
+ if (payload.model && !model) {
19220
+ model = String(payload.model);
19221
+ }
19222
+ break;
19223
+ }
19224
+ case "turn_context": {
19225
+ if (payload.model && !model) {
19226
+ model = String(payload.model);
19227
+ }
19228
+ if (payload.cwd && !cwd) {
19229
+ cwd = String(payload.cwd);
19230
+ }
19231
+ break;
19232
+ }
19233
+ case "response_item": {
19234
+ const itemType = String(payload.type ?? "");
19235
+ const role = String(payload.role ?? "");
19236
+ switch (itemType) {
19237
+ case "message": {
19238
+ if (role === "developer") break;
19239
+ const content = extractResponseItemContent(payload.content);
19240
+ if (role === "user" && content) {
19241
+ messages.push({ role: "user", content });
19242
+ } else if (role === "assistant" && content) {
19243
+ messages.push({ role: "assistant", content });
19244
+ }
19245
+ break;
19246
+ }
19247
+ case "function_call": {
19248
+ const toolName = String(payload.name ?? "");
19249
+ const callId = String(payload.call_id ?? "");
19250
+ let input;
19251
+ if (typeof payload.arguments === "string") {
19252
+ try {
19253
+ input = JSON.parse(payload.arguments);
19254
+ } catch {
19255
+ input = payload.arguments;
19256
+ }
19257
+ } else {
19258
+ input = payload.arguments;
19259
+ }
19260
+ const toolCall = { tool: toolName, input, id: callId };
19261
+ const msgIdx = messages.length;
19262
+ messages.push({
19263
+ role: "assistant",
19264
+ toolCalls: [toolCall]
19265
+ });
19266
+ if (callId) {
19267
+ pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
19268
+ }
19269
+ break;
19270
+ }
19271
+ case "custom_tool_call": {
19272
+ const toolName = String(payload.name ?? "");
19273
+ const callId = String(payload.call_id ?? "");
19274
+ let input;
19275
+ if (typeof payload.arguments === "string") {
19276
+ try {
19277
+ input = JSON.parse(payload.arguments);
19278
+ } catch {
19279
+ input = payload.arguments;
19280
+ }
19281
+ } else {
19282
+ input = payload.arguments;
19283
+ }
19284
+ const toolCall = { tool: toolName, input, id: callId };
19285
+ const msgIdx = messages.length;
19286
+ messages.push({
19287
+ role: "assistant",
19288
+ toolCalls: [toolCall]
19289
+ });
19290
+ if (callId) {
19291
+ pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
19292
+ }
19293
+ break;
19294
+ }
19295
+ case "function_call_output":
19296
+ case "custom_tool_call_output": {
19297
+ const callId = String(payload.call_id ?? "");
19298
+ const pending = pendingCalls.get(callId);
19299
+ if (pending) {
19300
+ const existingMsg = messages[pending.msgIdx];
19301
+ const existingCalls = [...existingMsg.toolCalls ?? []];
19302
+ existingCalls[pending.toolIdx] = {
19303
+ ...existingCalls[pending.toolIdx],
19304
+ output: payload.output
19305
+ };
19306
+ messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
19307
+ pendingCalls.delete(callId);
19308
+ }
19309
+ break;
19310
+ }
19311
+ // Skip reasoning blocks (thinking tokens)
19312
+ case "reasoning":
19313
+ break;
19314
+ }
19315
+ break;
19316
+ }
19317
+ }
19318
+ }
19319
+ let durationMs;
19320
+ if (startTimestamp && endTimestamp) {
19321
+ durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
19322
+ }
19323
+ const source = {
19324
+ provider: "codex",
19325
+ sessionId,
19326
+ cwd,
19327
+ startedAt: startTimestamp,
19328
+ model,
19329
+ version
19330
+ };
19331
+ return {
19332
+ messages,
19333
+ source,
19334
+ // Codex rollout files don't include token counts (only rate limit info)
19335
+ tokenUsage: void 0,
19336
+ durationMs,
19337
+ costUsd: null
19338
+ };
19339
+ }
19340
+ function extractResponseItemContent(content) {
19341
+ if (typeof content === "string") return content;
19342
+ if (!Array.isArray(content)) return void 0;
19343
+ const parts = [];
19344
+ for (const block of content) {
19345
+ if (typeof block === "object" && block !== null) {
19346
+ const b = block;
19347
+ if (typeof b.text === "string") {
19348
+ parts.push(b.text);
19349
+ }
19350
+ }
19351
+ }
19352
+ return parts.length > 0 ? parts.join("") : void 0;
19353
+ }
19354
+
19355
+ // src/import/codex-session-discovery.ts
18686
19356
  import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
18687
19357
  import { homedir as homedir3 } from "node:os";
18688
19358
  import path48 from "node:path";
18689
- var DEFAULT_PROJECTS_DIR = () => path48.join(homedir3(), ".claude", "projects");
19359
+ var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");
19360
+ async function discoverCodexSessions(opts) {
19361
+ const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
19362
+ const limit = opts?.latest ? 1 : opts?.limit ?? 10;
19363
+ const sessions = [];
19364
+ let yearDirs;
19365
+ try {
19366
+ yearDirs = await readdir8(sessionsDir);
19367
+ } catch {
19368
+ return [];
19369
+ }
19370
+ for (const year of yearDirs) {
19371
+ const yearPath = path48.join(sessionsDir, year);
19372
+ let monthDirs;
19373
+ try {
19374
+ monthDirs = await readdir8(yearPath);
19375
+ } catch {
19376
+ continue;
19377
+ }
19378
+ for (const month of monthDirs) {
19379
+ const monthPath = path48.join(yearPath, month);
19380
+ let dayDirs;
19381
+ try {
19382
+ dayDirs = await readdir8(monthPath);
19383
+ } catch {
19384
+ continue;
19385
+ }
19386
+ for (const day of dayDirs) {
19387
+ if (opts?.date) {
19388
+ const dirDate = `${year}-${month}-${day}`;
19389
+ if (dirDate !== opts.date) continue;
19390
+ }
19391
+ const dayPath = path48.join(monthPath, day);
19392
+ let files;
19393
+ try {
19394
+ files = await readdir8(dayPath);
19395
+ } catch {
19396
+ continue;
19397
+ }
19398
+ for (const file of files) {
19399
+ if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
19400
+ const filePath = path48.join(dayPath, file);
19401
+ const nameWithoutExt = file.replace(/\.jsonl$/, "");
19402
+ const parts = nameWithoutExt.split("-");
19403
+ const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
19404
+ let updatedAt;
19405
+ try {
19406
+ const fileStat = await stat9(filePath);
19407
+ updatedAt = fileStat.mtime;
19408
+ } catch {
19409
+ updatedAt = /* @__PURE__ */ new Date(0);
19410
+ }
19411
+ sessions.push({ sessionId, filePath, filename: file, updatedAt });
19412
+ }
19413
+ }
19414
+ }
19415
+ }
19416
+ sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
19417
+ return sessions.slice(0, limit);
19418
+ }
19419
+
19420
+ // src/import/session-discovery.ts
19421
+ import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
19422
+ import { homedir as homedir4 } from "node:os";
19423
+ import path49 from "node:path";
19424
+ var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
18690
19425
  function encodeProjectPath(projectPath) {
18691
19426
  return projectPath.replace(/\//g, "-");
18692
19427
  }
@@ -18695,7 +19430,7 @@ async function discoverClaudeSessions(opts) {
18695
19430
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
18696
19431
  let projectDirs;
18697
19432
  try {
18698
- projectDirs = await readdir8(projectsDir);
19433
+ projectDirs = await readdir9(projectsDir);
18699
19434
  } catch {
18700
19435
  return [];
18701
19436
  }
@@ -18705,10 +19440,10 @@ async function discoverClaudeSessions(opts) {
18705
19440
  }
18706
19441
  const sessions = [];
18707
19442
  for (const projectDir of projectDirs) {
18708
- const dirPath = path48.join(projectsDir, projectDir);
19443
+ const dirPath = path49.join(projectsDir, projectDir);
18709
19444
  let entries;
18710
19445
  try {
18711
- entries = await readdir8(dirPath);
19446
+ entries = await readdir9(dirPath);
18712
19447
  } catch {
18713
19448
  continue;
18714
19449
  }
@@ -18716,10 +19451,10 @@ async function discoverClaudeSessions(opts) {
18716
19451
  if (!entry.endsWith(".jsonl")) continue;
18717
19452
  const sessionId = entry.replace(/\.jsonl$/, "");
18718
19453
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
18719
- const filePath = path48.join(dirPath, entry);
19454
+ const filePath = path49.join(dirPath, entry);
18720
19455
  let updatedAt;
18721
19456
  try {
18722
- const fileStat = await stat9(filePath);
19457
+ const fileStat = await stat10(filePath);
18723
19458
  updatedAt = fileStat.mtime;
18724
19459
  } catch {
18725
19460
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -18738,10 +19473,85 @@ async function discoverClaudeSessions(opts) {
18738
19473
 
18739
19474
  // src/import/types.ts
18740
19475
  import { readFile as readFile14 } from "node:fs/promises";
19476
+ function toTranscriptJsonLine(entry) {
19477
+ const firstUserMessage = entry.messages.find((m) => m.role === "user");
19478
+ const input = typeof firstUserMessage?.content === "string" ? firstUserMessage.content : "";
19479
+ return {
19480
+ input,
19481
+ output: entry.messages,
19482
+ token_usage: entry.tokenUsage ? {
19483
+ input: entry.tokenUsage.input,
19484
+ output: entry.tokenUsage.output,
19485
+ cached: entry.tokenUsage.cached
19486
+ } : void 0,
19487
+ duration_ms: entry.durationMs,
19488
+ cost_usd: entry.costUsd,
19489
+ source: {
19490
+ provider: entry.source.provider,
19491
+ session_id: entry.source.sessionId,
19492
+ model: entry.source.model,
19493
+ timestamp: entry.source.startedAt,
19494
+ git_branch: entry.source.gitBranch,
19495
+ cwd: entry.source.cwd ?? entry.source.projectPath,
19496
+ version: entry.source.version
19497
+ }
19498
+ };
19499
+ }
19500
+ async function readTranscriptJsonl(filePath) {
19501
+ const text = await readFile14(filePath, "utf8");
19502
+ return text.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
19503
+ }
18741
19504
  async function readTranscriptFile(filePath) {
18742
19505
  return readFile14(filePath, "utf8");
18743
19506
  }
18744
19507
 
19508
+ // src/import/transcript-provider.ts
19509
+ var TranscriptProvider = class _TranscriptProvider {
19510
+ id;
19511
+ kind = "transcript";
19512
+ targetName;
19513
+ lines;
19514
+ cursor = 0;
19515
+ constructor(targetName, lines) {
19516
+ this.targetName = targetName;
19517
+ this.id = `transcript:${targetName}`;
19518
+ this.lines = lines;
19519
+ }
19520
+ /**
19521
+ * Create a TranscriptProvider from a JSONL file path.
19522
+ */
19523
+ static async fromFile(filePath) {
19524
+ const lines = await readTranscriptJsonl(filePath);
19525
+ if (lines.length === 0) {
19526
+ throw new Error(`Transcript file is empty: ${filePath}`);
19527
+ }
19528
+ const providerName = lines[0].source.provider ?? "transcript";
19529
+ return new _TranscriptProvider(providerName, lines);
19530
+ }
19531
+ get lineCount() {
19532
+ return this.lines.length;
19533
+ }
19534
+ async invoke(_request) {
19535
+ if (this.cursor >= this.lines.length) {
19536
+ throw new Error(
19537
+ `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
19538
+ );
19539
+ }
19540
+ const line = this.lines[this.cursor++];
19541
+ return {
19542
+ output: line.output,
19543
+ tokenUsage: line.token_usage ? {
19544
+ input: line.token_usage.input,
19545
+ output: line.token_usage.output,
19546
+ cached: line.token_usage.cached
19547
+ } : void 0,
19548
+ durationMs: line.duration_ms,
19549
+ costUsd: line.cost_usd ?? void 0,
19550
+ startTime: line.source.timestamp
19551
+ };
19552
+ }
19553
+ };
19554
+
18745
19555
  // src/index.ts
18746
19556
  function createAgentKernel() {
18747
19557
  return { status: "stub" };
@@ -18755,6 +19565,7 @@ export {
18755
19565
  DEFAULT_EVALUATOR_TEMPLATE,
18756
19566
  DEFAULT_EVAL_PATTERNS,
18757
19567
  DEFAULT_EXPLORATION_TOOLS,
19568
+ DEFAULT_THRESHOLD,
18758
19569
  DeterministicAssertionEvaluator,
18759
19570
  EvaluatorRegistry,
18760
19571
  ExecutionMetricsEvaluator,
@@ -18776,6 +19587,7 @@ export {
18776
19587
  TemplateNotFoundError,
18777
19588
  TokenUsageEvaluator,
18778
19589
  ToolTrajectoryEvaluator,
19590
+ TranscriptProvider,
18779
19591
  WorkspaceCreationError,
18780
19592
  WorkspacePoolManager,
18781
19593
  addProject,
@@ -18812,6 +19624,7 @@ export {
18812
19624
  detectFormat,
18813
19625
  discoverAssertions,
18814
19626
  discoverClaudeSessions,
19627
+ discoverCodexSessions,
18815
19628
  discoverCopilotSessions,
18816
19629
  discoverGraders,
18817
19630
  discoverGraders as discoverJudges,
@@ -18872,6 +19685,8 @@ export {
18872
19685
  normalizeLineEndings,
18873
19686
  parseAgentSkillsEvals,
18874
19687
  parseClaudeSession,
19688
+ parseCodexSession,
19689
+ parseCopilotEvents,
18875
19690
  parseJsonFromText,
18876
19691
  parseJsonSafe,
18877
19692
  readJsonFile,
@@ -18879,8 +19694,10 @@ export {
18879
19694
  readTestSuiteMetadata,
18880
19695
  readTextFile,
18881
19696
  readTranscriptFile,
19697
+ readTranscriptJsonl,
18882
19698
  removeProject,
18883
19699
  resolveAndCreateProvider,
19700
+ resolveDelegatedTargetDefinition,
18884
19701
  resolveFileReference,
18885
19702
  resolveTargetDefinition,
18886
19703
  resolveWorkspaceTemplate,
@@ -18910,6 +19727,7 @@ export {
18910
19727
  substituteVariables,
18911
19728
  toCamelCaseDeep,
18912
19729
  toSnakeCaseDeep,
19730
+ toTranscriptJsonLine,
18913
19731
  tokensPerTool,
18914
19732
  touchProject,
18915
19733
  transpileEvalYaml,