@m4trix/evals 0.21.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -154
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +201 -155
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1443,6 +1443,20 @@ function readOutput(testCase) {
|
|
|
1443
1443
|
}
|
|
1444
1444
|
return candidate.getOutput();
|
|
1445
1445
|
}
|
|
1446
|
+
function buildEvaluationUnits(testCases) {
|
|
1447
|
+
const units = [];
|
|
1448
|
+
for (const testCaseItem of testCases) {
|
|
1449
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1450
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
1451
|
+
units.push({
|
|
1452
|
+
testCaseItem,
|
|
1453
|
+
rerunIndex: r + 1,
|
|
1454
|
+
rerunTotal
|
|
1455
|
+
});
|
|
1456
|
+
}
|
|
1457
|
+
}
|
|
1458
|
+
return units;
|
|
1459
|
+
}
|
|
1446
1460
|
function nowIsoForFile() {
|
|
1447
1461
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1448
1462
|
}
|
|
@@ -1452,157 +1466,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1452
1466
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1453
1467
|
);
|
|
1454
1468
|
}
|
|
1455
|
-
function
|
|
1469
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1470
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1456
1471
|
return effect.Effect.gen(function* () {
|
|
1457
|
-
const
|
|
1458
|
-
const
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
|
|
1519
|
-
|
|
1520
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1521
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1522
|
-
testCaseError = result.message;
|
|
1523
|
-
evaluatorScores.push({
|
|
1524
|
-
evaluatorId,
|
|
1525
|
-
scores: [],
|
|
1526
|
-
passed: false,
|
|
1527
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1528
|
-
});
|
|
1529
|
-
continue;
|
|
1530
|
-
}
|
|
1531
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1532
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1533
|
-
evaluatorScores.push({
|
|
1534
|
-
evaluatorId,
|
|
1535
|
-
scores,
|
|
1536
|
-
passed: passed2,
|
|
1537
|
-
metrics,
|
|
1538
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1539
|
-
});
|
|
1540
|
-
} catch (error) {
|
|
1541
|
-
if (error instanceof Error) {
|
|
1542
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1543
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1544
|
-
}
|
|
1545
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1472
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1473
|
+
const started = Date.now();
|
|
1474
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1475
|
+
n + 1,
|
|
1476
|
+
n + 1
|
|
1477
|
+
]);
|
|
1478
|
+
yield* publishEvent({
|
|
1479
|
+
type: "TestCaseStarted",
|
|
1480
|
+
runId: task.runId,
|
|
1481
|
+
testCaseId: testCaseItem.id,
|
|
1482
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1483
|
+
startedTestCases: startedEvaluations,
|
|
1484
|
+
totalTestCases: totalEvaluations,
|
|
1485
|
+
rerunIndex,
|
|
1486
|
+
rerunTotal
|
|
1487
|
+
});
|
|
1488
|
+
const evaluatorScores = [];
|
|
1489
|
+
let testCaseError;
|
|
1490
|
+
const output = readOutput(testCaseItem.testCase);
|
|
1491
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
1492
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
1493
|
+
if (!evaluateFn) {
|
|
1494
|
+
continue;
|
|
1495
|
+
}
|
|
1496
|
+
const logs = [];
|
|
1497
|
+
const logDiff = (expected, actual, options) => {
|
|
1498
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1499
|
+
};
|
|
1500
|
+
const log = (message, options) => {
|
|
1501
|
+
logs.push(createLogEntry(message, options));
|
|
1502
|
+
};
|
|
1503
|
+
const createError = (message, options) => {
|
|
1504
|
+
const entry = createLogEntry(message, options);
|
|
1505
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1506
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1507
|
+
return error;
|
|
1508
|
+
};
|
|
1509
|
+
try {
|
|
1510
|
+
const ctx = yield* effect.Effect.promise(
|
|
1511
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1512
|
+
);
|
|
1513
|
+
const result = yield* effect.Effect.promise(
|
|
1514
|
+
() => Promise.resolve().then(
|
|
1515
|
+
() => evaluateFn({
|
|
1516
|
+
input: testCaseItem.testCase.getInput(),
|
|
1517
|
+
ctx,
|
|
1518
|
+
output,
|
|
1519
|
+
meta: {
|
|
1520
|
+
triggerId: task.triggerId,
|
|
1521
|
+
runId: evaluatorRunId,
|
|
1522
|
+
datasetId: task.datasetId
|
|
1523
|
+
},
|
|
1524
|
+
logDiff,
|
|
1525
|
+
log,
|
|
1526
|
+
createError
|
|
1527
|
+
})
|
|
1528
|
+
)
|
|
1529
|
+
);
|
|
1530
|
+
if (result instanceof Error) {
|
|
1531
|
+
const evaluatorError = result;
|
|
1532
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1533
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1534
|
+
testCaseError = result.message;
|
|
1546
1535
|
evaluatorScores.push({
|
|
1547
1536
|
evaluatorId,
|
|
1548
1537
|
scores: [],
|
|
1549
1538
|
passed: false,
|
|
1550
1539
|
logs: logs.length > 0 ? logs : void 0
|
|
1551
1540
|
});
|
|
1541
|
+
continue;
|
|
1542
|
+
}
|
|
1543
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1544
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1545
|
+
evaluatorScores.push({
|
|
1546
|
+
evaluatorId,
|
|
1547
|
+
scores,
|
|
1548
|
+
passed,
|
|
1549
|
+
metrics,
|
|
1550
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1551
|
+
});
|
|
1552
|
+
} catch (error) {
|
|
1553
|
+
if (error instanceof Error) {
|
|
1554
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1555
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1552
1556
|
}
|
|
1557
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1558
|
+
evaluatorScores.push({
|
|
1559
|
+
evaluatorId,
|
|
1560
|
+
scores: [],
|
|
1561
|
+
passed: false,
|
|
1562
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1563
|
+
});
|
|
1553
1564
|
}
|
|
1554
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1555
|
-
rerunPassed.push(rerunPassedThis);
|
|
1556
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1557
|
-
n + 1,
|
|
1558
|
-
n + 1
|
|
1559
|
-
]);
|
|
1560
|
-
const progressEvent = {
|
|
1561
|
-
type: "TestCaseProgress",
|
|
1562
|
-
runId: task.runId,
|
|
1563
|
-
testCaseId: testCaseItem.id,
|
|
1564
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1565
|
-
completedTestCases: completedEvaluations,
|
|
1566
|
-
totalTestCases: totalEvaluations,
|
|
1567
|
-
rerunIndex: r + 1,
|
|
1568
|
-
rerunTotal: reruns,
|
|
1569
|
-
passed: rerunPassedThis,
|
|
1570
|
-
durationMs: Date.now() - started,
|
|
1571
|
-
evaluatorScores,
|
|
1572
|
-
output,
|
|
1573
|
-
errorMessage: testCaseError
|
|
1574
|
-
};
|
|
1575
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1576
|
-
...snapshot,
|
|
1577
|
-
completedTestCases: completedEvaluations
|
|
1578
|
-
}));
|
|
1579
|
-
yield* publishEvent(progressEvent);
|
|
1580
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
1581
|
-
runId: task.runId,
|
|
1582
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1583
|
-
payload: progressEvent
|
|
1584
|
-
});
|
|
1585
|
-
}
|
|
1586
|
-
const testCasePassed = rerunPassed.every(Boolean);
|
|
1587
|
-
if (testCasePassed) {
|
|
1588
|
-
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
1589
|
-
} else {
|
|
1590
|
-
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1591
1565
|
}
|
|
1592
|
-
const
|
|
1593
|
-
|
|
1594
|
-
|
|
1566
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1567
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1568
|
+
n + 1,
|
|
1569
|
+
n + 1
|
|
1595
1570
|
]);
|
|
1596
|
-
|
|
1571
|
+
const progressEvent = {
|
|
1572
|
+
type: "TestCaseProgress",
|
|
1573
|
+
runId: task.runId,
|
|
1574
|
+
testCaseId: testCaseItem.id,
|
|
1575
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1576
|
+
completedTestCases: completedEvaluations,
|
|
1577
|
+
totalTestCases: totalEvaluations,
|
|
1578
|
+
rerunIndex,
|
|
1579
|
+
rerunTotal,
|
|
1580
|
+
passed: rerunPassedThis,
|
|
1581
|
+
durationMs: Date.now() - started,
|
|
1582
|
+
evaluatorScores,
|
|
1583
|
+
output,
|
|
1584
|
+
errorMessage: testCaseError
|
|
1585
|
+
};
|
|
1586
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1597
1587
|
...snapshot,
|
|
1598
|
-
|
|
1599
|
-
failedTestCases: failed
|
|
1588
|
+
completedTestCases: completedEvaluations
|
|
1600
1589
|
}));
|
|
1590
|
+
yield* publishEvent(progressEvent);
|
|
1591
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
1592
|
+
runId: task.runId,
|
|
1593
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1594
|
+
payload: progressEvent
|
|
1595
|
+
});
|
|
1596
|
+
const testCaseCompleted = yield* effect.Ref.modify(
|
|
1597
|
+
testCaseResultsRef,
|
|
1598
|
+
(map) => {
|
|
1599
|
+
const key = testCaseItem.id;
|
|
1600
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1601
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
1602
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
1603
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
1604
|
+
const newMap = new Map(map);
|
|
1605
|
+
newMap.set(key, {
|
|
1606
|
+
completedCount: newCompletedCount,
|
|
1607
|
+
results: newResults
|
|
1608
|
+
});
|
|
1609
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
1610
|
+
return [outcome, newMap];
|
|
1611
|
+
}
|
|
1612
|
+
);
|
|
1613
|
+
if (testCaseCompleted !== null) {
|
|
1614
|
+
if (testCaseCompleted) {
|
|
1615
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
1616
|
+
} else {
|
|
1617
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1618
|
+
}
|
|
1619
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
1620
|
+
effect.Ref.get(passedRef),
|
|
1621
|
+
effect.Ref.get(failedRef)
|
|
1622
|
+
]);
|
|
1623
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1624
|
+
...snapshot,
|
|
1625
|
+
passedTestCases: passed,
|
|
1626
|
+
failedTestCases: failed
|
|
1627
|
+
}));
|
|
1628
|
+
}
|
|
1601
1629
|
});
|
|
1602
1630
|
}
|
|
1603
1631
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
1604
1632
|
const startedAt = Date.now();
|
|
1605
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1633
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1606
1634
|
...snapshot,
|
|
1607
1635
|
status: "running",
|
|
1608
1636
|
startedAt
|
|
@@ -1621,9 +1649,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1621
1649
|
const startedRef = yield* effect.Ref.make(0);
|
|
1622
1650
|
const passedRef = yield* effect.Ref.make(0);
|
|
1623
1651
|
const failedRef = yield* effect.Ref.make(0);
|
|
1624
|
-
const
|
|
1652
|
+
const testCaseResultsRef = yield* effect.Ref.make(
|
|
1653
|
+
/* @__PURE__ */ new Map()
|
|
1654
|
+
);
|
|
1655
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1656
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
1625
1657
|
task,
|
|
1626
|
-
|
|
1658
|
+
unit,
|
|
1627
1659
|
totalEvaluations,
|
|
1628
1660
|
publishEvent,
|
|
1629
1661
|
persistenceQueue,
|
|
@@ -1631,11 +1663,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1631
1663
|
startedRef,
|
|
1632
1664
|
completedRef,
|
|
1633
1665
|
passedRef,
|
|
1634
|
-
failedRef
|
|
1666
|
+
failedRef,
|
|
1667
|
+
testCaseResultsRef
|
|
1635
1668
|
);
|
|
1636
1669
|
yield* effect.Effect.forEach(
|
|
1637
|
-
|
|
1638
|
-
|
|
1670
|
+
evaluationUnits,
|
|
1671
|
+
processEvaluation,
|
|
1639
1672
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1640
1673
|
);
|
|
1641
1674
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
@@ -1653,7 +1686,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1653
1686
|
totalTestCases: task.testCases.length,
|
|
1654
1687
|
artifactPath: task.snapshot.artifactPath
|
|
1655
1688
|
};
|
|
1656
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1689
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1657
1690
|
...snapshot,
|
|
1658
1691
|
status: "completed",
|
|
1659
1692
|
completedTestCases: completedEvaluations,
|
|
@@ -1936,7 +1969,9 @@ var EffectRunner = class {
|
|
|
1936
1969
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1937
1970
|
effect.Queue.unbounded()
|
|
1938
1971
|
);
|
|
1939
|
-
this.
|
|
1972
|
+
this.snapshotsRef = effect.Effect.runSync(
|
|
1973
|
+
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1974
|
+
);
|
|
1940
1975
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1941
1976
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1942
1977
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -2039,7 +2074,13 @@ var EffectRunner = class {
|
|
|
2039
2074
|
status: "queued",
|
|
2040
2075
|
artifactPath
|
|
2041
2076
|
};
|
|
2042
|
-
|
|
2077
|
+
await effect.Effect.runPromise(
|
|
2078
|
+
effect.Ref.update(this.snapshotsRef, (map) => {
|
|
2079
|
+
const next = new Map(map);
|
|
2080
|
+
next.set(runId, snapshot);
|
|
2081
|
+
return next;
|
|
2082
|
+
})
|
|
2083
|
+
);
|
|
2043
2084
|
const queuedEvent = {
|
|
2044
2085
|
type: "RunQueued",
|
|
2045
2086
|
runId,
|
|
@@ -2080,12 +2121,12 @@ var EffectRunner = class {
|
|
|
2080
2121
|
};
|
|
2081
2122
|
}
|
|
2082
2123
|
getRunSnapshot(runId) {
|
|
2083
|
-
return this.
|
|
2124
|
+
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
2084
2125
|
}
|
|
2085
2126
|
getAllRunSnapshots() {
|
|
2086
|
-
return Array.from(
|
|
2087
|
-
(
|
|
2088
|
-
);
|
|
2127
|
+
return Array.from(
|
|
2128
|
+
effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
|
|
2129
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
2089
2130
|
}
|
|
2090
2131
|
async loadRunSnapshotsFromArtifacts() {
|
|
2091
2132
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -2114,11 +2155,15 @@ var EffectRunner = class {
|
|
|
2114
2155
|
);
|
|
2115
2156
|
}
|
|
2116
2157
|
updateSnapshot(runId, updater) {
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2158
|
+
return effect.Ref.modify(this.snapshotsRef, (map) => {
|
|
2159
|
+
const existing = map.get(runId);
|
|
2160
|
+
if (!existing) {
|
|
2161
|
+
return [void 0, map];
|
|
2162
|
+
}
|
|
2163
|
+
const next = new Map(map);
|
|
2164
|
+
next.set(runId, updater(existing));
|
|
2165
|
+
return [void 0, next];
|
|
2166
|
+
}).pipe(effect.Effect.asVoid);
|
|
2122
2167
|
}
|
|
2123
2168
|
publishEvent(event) {
|
|
2124
2169
|
return effect.Effect.sync(() => {
|