@m4trix/evals 0.21.1 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +196 -151
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +197 -152
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -6,7 +6,7 @@ import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
|
6
6
|
import { resolve, relative, join, dirname } from 'path';
|
|
7
7
|
import { LineGraph } from '@pppp606/ink-chart';
|
|
8
8
|
import { randomUUID } from 'crypto';
|
|
9
|
-
import { Effect, PubSub, Queue,
|
|
9
|
+
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
12
|
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
@@ -1416,6 +1416,20 @@ function readOutput(testCase) {
|
|
|
1416
1416
|
}
|
|
1417
1417
|
return candidate.getOutput();
|
|
1418
1418
|
}
|
|
1419
|
+
function buildEvaluationUnits(testCases) {
|
|
1420
|
+
const units = [];
|
|
1421
|
+
for (const testCaseItem of testCases) {
|
|
1422
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1423
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
1424
|
+
units.push({
|
|
1425
|
+
testCaseItem,
|
|
1426
|
+
rerunIndex: r + 1,
|
|
1427
|
+
rerunTotal
|
|
1428
|
+
});
|
|
1429
|
+
}
|
|
1430
|
+
}
|
|
1431
|
+
return units;
|
|
1432
|
+
}
|
|
1419
1433
|
function nowIsoForFile() {
|
|
1420
1434
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1421
1435
|
}
|
|
@@ -1425,157 +1439,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1425
1439
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1426
1440
|
);
|
|
1427
1441
|
}
|
|
1428
|
-
function
|
|
1442
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1443
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1429
1444
|
return Effect.gen(function* () {
|
|
1430
|
-
const
|
|
1431
|
-
const
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1494
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1495
|
-
testCaseError = result.message;
|
|
1496
|
-
evaluatorScores.push({
|
|
1497
|
-
evaluatorId,
|
|
1498
|
-
scores: [],
|
|
1499
|
-
passed: false,
|
|
1500
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1501
|
-
});
|
|
1502
|
-
continue;
|
|
1503
|
-
}
|
|
1504
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1505
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1506
|
-
evaluatorScores.push({
|
|
1507
|
-
evaluatorId,
|
|
1508
|
-
scores,
|
|
1509
|
-
passed: passed2,
|
|
1510
|
-
metrics,
|
|
1511
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1512
|
-
});
|
|
1513
|
-
} catch (error) {
|
|
1514
|
-
if (error instanceof Error) {
|
|
1515
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1516
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1517
|
-
}
|
|
1518
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1445
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1446
|
+
const started = Date.now();
|
|
1447
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1448
|
+
n + 1,
|
|
1449
|
+
n + 1
|
|
1450
|
+
]);
|
|
1451
|
+
yield* publishEvent({
|
|
1452
|
+
type: "TestCaseStarted",
|
|
1453
|
+
runId: task.runId,
|
|
1454
|
+
testCaseId: testCaseItem.id,
|
|
1455
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1456
|
+
startedTestCases: startedEvaluations,
|
|
1457
|
+
totalTestCases: totalEvaluations,
|
|
1458
|
+
rerunIndex,
|
|
1459
|
+
rerunTotal
|
|
1460
|
+
});
|
|
1461
|
+
const evaluatorScores = [];
|
|
1462
|
+
let testCaseError;
|
|
1463
|
+
const output = readOutput(testCaseItem.testCase);
|
|
1464
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
1465
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
1466
|
+
if (!evaluateFn) {
|
|
1467
|
+
continue;
|
|
1468
|
+
}
|
|
1469
|
+
const logs = [];
|
|
1470
|
+
const logDiff = (expected, actual, options) => {
|
|
1471
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1472
|
+
};
|
|
1473
|
+
const log = (message, options) => {
|
|
1474
|
+
logs.push(createLogEntry(message, options));
|
|
1475
|
+
};
|
|
1476
|
+
const createError = (message, options) => {
|
|
1477
|
+
const entry = createLogEntry(message, options);
|
|
1478
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1479
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1480
|
+
return error;
|
|
1481
|
+
};
|
|
1482
|
+
try {
|
|
1483
|
+
const ctx = yield* Effect.promise(
|
|
1484
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1485
|
+
);
|
|
1486
|
+
const result = yield* Effect.promise(
|
|
1487
|
+
() => Promise.resolve().then(
|
|
1488
|
+
() => evaluateFn({
|
|
1489
|
+
input: testCaseItem.testCase.getInput(),
|
|
1490
|
+
ctx,
|
|
1491
|
+
output,
|
|
1492
|
+
meta: {
|
|
1493
|
+
triggerId: task.triggerId,
|
|
1494
|
+
runId: evaluatorRunId,
|
|
1495
|
+
datasetId: task.datasetId
|
|
1496
|
+
},
|
|
1497
|
+
logDiff,
|
|
1498
|
+
log,
|
|
1499
|
+
createError
|
|
1500
|
+
})
|
|
1501
|
+
)
|
|
1502
|
+
);
|
|
1503
|
+
if (result instanceof Error) {
|
|
1504
|
+
const evaluatorError = result;
|
|
1505
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1506
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1507
|
+
testCaseError = result.message;
|
|
1519
1508
|
evaluatorScores.push({
|
|
1520
1509
|
evaluatorId,
|
|
1521
1510
|
scores: [],
|
|
1522
1511
|
passed: false,
|
|
1523
1512
|
logs: logs.length > 0 ? logs : void 0
|
|
1524
1513
|
});
|
|
1514
|
+
continue;
|
|
1515
|
+
}
|
|
1516
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1517
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1518
|
+
evaluatorScores.push({
|
|
1519
|
+
evaluatorId,
|
|
1520
|
+
scores,
|
|
1521
|
+
passed,
|
|
1522
|
+
metrics,
|
|
1523
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1524
|
+
});
|
|
1525
|
+
} catch (error) {
|
|
1526
|
+
if (error instanceof Error) {
|
|
1527
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1528
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1525
1529
|
}
|
|
1530
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1531
|
+
evaluatorScores.push({
|
|
1532
|
+
evaluatorId,
|
|
1533
|
+
scores: [],
|
|
1534
|
+
passed: false,
|
|
1535
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1536
|
+
});
|
|
1526
1537
|
}
|
|
1527
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1528
|
-
rerunPassed.push(rerunPassedThis);
|
|
1529
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1530
|
-
n + 1,
|
|
1531
|
-
n + 1
|
|
1532
|
-
]);
|
|
1533
|
-
const progressEvent = {
|
|
1534
|
-
type: "TestCaseProgress",
|
|
1535
|
-
runId: task.runId,
|
|
1536
|
-
testCaseId: testCaseItem.id,
|
|
1537
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1538
|
-
completedTestCases: completedEvaluations,
|
|
1539
|
-
totalTestCases: totalEvaluations,
|
|
1540
|
-
rerunIndex: r + 1,
|
|
1541
|
-
rerunTotal: reruns,
|
|
1542
|
-
passed: rerunPassedThis,
|
|
1543
|
-
durationMs: Date.now() - started,
|
|
1544
|
-
evaluatorScores,
|
|
1545
|
-
output,
|
|
1546
|
-
errorMessage: testCaseError
|
|
1547
|
-
};
|
|
1548
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1549
|
-
...snapshot,
|
|
1550
|
-
completedTestCases: completedEvaluations
|
|
1551
|
-
}));
|
|
1552
|
-
yield* publishEvent(progressEvent);
|
|
1553
|
-
yield* Queue.offer(persistenceQueue, {
|
|
1554
|
-
runId: task.runId,
|
|
1555
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1556
|
-
payload: progressEvent
|
|
1557
|
-
});
|
|
1558
|
-
}
|
|
1559
|
-
const testCasePassed = rerunPassed.every(Boolean);
|
|
1560
|
-
if (testCasePassed) {
|
|
1561
|
-
yield* Ref.update(passedRef, (n) => n + 1);
|
|
1562
|
-
} else {
|
|
1563
|
-
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1564
1538
|
}
|
|
1565
|
-
const
|
|
1566
|
-
|
|
1567
|
-
|
|
1539
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1540
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1541
|
+
n + 1,
|
|
1542
|
+
n + 1
|
|
1568
1543
|
]);
|
|
1569
|
-
|
|
1544
|
+
const progressEvent = {
|
|
1545
|
+
type: "TestCaseProgress",
|
|
1546
|
+
runId: task.runId,
|
|
1547
|
+
testCaseId: testCaseItem.id,
|
|
1548
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1549
|
+
completedTestCases: completedEvaluations,
|
|
1550
|
+
totalTestCases: totalEvaluations,
|
|
1551
|
+
rerunIndex,
|
|
1552
|
+
rerunTotal,
|
|
1553
|
+
passed: rerunPassedThis,
|
|
1554
|
+
durationMs: Date.now() - started,
|
|
1555
|
+
evaluatorScores,
|
|
1556
|
+
output,
|
|
1557
|
+
errorMessage: testCaseError
|
|
1558
|
+
};
|
|
1559
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1570
1560
|
...snapshot,
|
|
1571
|
-
|
|
1572
|
-
failedTestCases: failed
|
|
1561
|
+
completedTestCases: completedEvaluations
|
|
1573
1562
|
}));
|
|
1563
|
+
yield* publishEvent(progressEvent);
|
|
1564
|
+
yield* Queue.offer(persistenceQueue, {
|
|
1565
|
+
runId: task.runId,
|
|
1566
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1567
|
+
payload: progressEvent
|
|
1568
|
+
});
|
|
1569
|
+
const testCaseCompleted = yield* Ref.modify(
|
|
1570
|
+
testCaseResultsRef,
|
|
1571
|
+
(map) => {
|
|
1572
|
+
const key = testCaseItem.id;
|
|
1573
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
1574
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
1575
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
1576
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
1577
|
+
const newMap = new Map(map);
|
|
1578
|
+
newMap.set(key, {
|
|
1579
|
+
completedCount: newCompletedCount,
|
|
1580
|
+
results: newResults
|
|
1581
|
+
});
|
|
1582
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
1583
|
+
return [outcome, newMap];
|
|
1584
|
+
}
|
|
1585
|
+
);
|
|
1586
|
+
if (testCaseCompleted !== null) {
|
|
1587
|
+
if (testCaseCompleted) {
|
|
1588
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
1589
|
+
} else {
|
|
1590
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1591
|
+
}
|
|
1592
|
+
const [passed, failed] = yield* Effect.all([
|
|
1593
|
+
Ref.get(passedRef),
|
|
1594
|
+
Ref.get(failedRef)
|
|
1595
|
+
]);
|
|
1596
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1597
|
+
...snapshot,
|
|
1598
|
+
passedTestCases: passed,
|
|
1599
|
+
failedTestCases: failed
|
|
1600
|
+
}));
|
|
1601
|
+
}
|
|
1574
1602
|
});
|
|
1575
1603
|
}
|
|
1576
1604
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
1577
1605
|
const startedAt = Date.now();
|
|
1578
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1606
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1579
1607
|
...snapshot,
|
|
1580
1608
|
status: "running",
|
|
1581
1609
|
startedAt
|
|
@@ -1594,9 +1622,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1594
1622
|
const startedRef = yield* Ref.make(0);
|
|
1595
1623
|
const passedRef = yield* Ref.make(0);
|
|
1596
1624
|
const failedRef = yield* Ref.make(0);
|
|
1597
|
-
const
|
|
1625
|
+
const testCaseResultsRef = yield* Ref.make(
|
|
1626
|
+
/* @__PURE__ */ new Map()
|
|
1627
|
+
);
|
|
1628
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
1629
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
1598
1630
|
task,
|
|
1599
|
-
|
|
1631
|
+
unit,
|
|
1600
1632
|
totalEvaluations,
|
|
1601
1633
|
publishEvent,
|
|
1602
1634
|
persistenceQueue,
|
|
@@ -1604,11 +1636,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1604
1636
|
startedRef,
|
|
1605
1637
|
completedRef,
|
|
1606
1638
|
passedRef,
|
|
1607
|
-
failedRef
|
|
1639
|
+
failedRef,
|
|
1640
|
+
testCaseResultsRef
|
|
1608
1641
|
);
|
|
1609
1642
|
yield* Effect.forEach(
|
|
1610
|
-
|
|
1611
|
-
|
|
1643
|
+
evaluationUnits,
|
|
1644
|
+
processEvaluation,
|
|
1612
1645
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1613
1646
|
);
|
|
1614
1647
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
@@ -1626,7 +1659,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1626
1659
|
totalTestCases: task.testCases.length,
|
|
1627
1660
|
artifactPath: task.snapshot.artifactPath
|
|
1628
1661
|
};
|
|
1629
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1662
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1630
1663
|
...snapshot,
|
|
1631
1664
|
status: "completed",
|
|
1632
1665
|
completedTestCases: completedEvaluations,
|
|
@@ -1909,7 +1942,9 @@ var EffectRunner = class {
|
|
|
1909
1942
|
this.persistenceQueue = Effect.runSync(
|
|
1910
1943
|
Queue.unbounded()
|
|
1911
1944
|
);
|
|
1912
|
-
this.
|
|
1945
|
+
this.snapshotsRef = Effect.runSync(
|
|
1946
|
+
Ref.make(/* @__PURE__ */ new Map())
|
|
1947
|
+
);
|
|
1913
1948
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1914
1949
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1915
1950
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -2012,7 +2047,13 @@ var EffectRunner = class {
|
|
|
2012
2047
|
status: "queued",
|
|
2013
2048
|
artifactPath
|
|
2014
2049
|
};
|
|
2015
|
-
|
|
2050
|
+
await Effect.runPromise(
|
|
2051
|
+
Ref.update(this.snapshotsRef, (map) => {
|
|
2052
|
+
const next = new Map(map);
|
|
2053
|
+
next.set(runId, snapshot);
|
|
2054
|
+
return next;
|
|
2055
|
+
})
|
|
2056
|
+
);
|
|
2016
2057
|
const queuedEvent = {
|
|
2017
2058
|
type: "RunQueued",
|
|
2018
2059
|
runId,
|
|
@@ -2053,12 +2094,12 @@ var EffectRunner = class {
|
|
|
2053
2094
|
};
|
|
2054
2095
|
}
|
|
2055
2096
|
getRunSnapshot(runId) {
|
|
2056
|
-
return this.
|
|
2097
|
+
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
2057
2098
|
}
|
|
2058
2099
|
getAllRunSnapshots() {
|
|
2059
|
-
return Array.from(
|
|
2060
|
-
(
|
|
2061
|
-
);
|
|
2100
|
+
return Array.from(
|
|
2101
|
+
Effect.runSync(Ref.get(this.snapshotsRef)).values()
|
|
2102
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
2062
2103
|
}
|
|
2063
2104
|
async loadRunSnapshotsFromArtifacts() {
|
|
2064
2105
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -2087,11 +2128,15 @@ var EffectRunner = class {
|
|
|
2087
2128
|
);
|
|
2088
2129
|
}
|
|
2089
2130
|
updateSnapshot(runId, updater) {
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2131
|
+
return Ref.modify(this.snapshotsRef, (map) => {
|
|
2132
|
+
const existing = map.get(runId);
|
|
2133
|
+
if (!existing) {
|
|
2134
|
+
return [void 0, map];
|
|
2135
|
+
}
|
|
2136
|
+
const next = new Map(map);
|
|
2137
|
+
next.set(runId, updater(existing));
|
|
2138
|
+
return [void 0, next];
|
|
2139
|
+
}).pipe(Effect.asVoid);
|
|
2095
2140
|
}
|
|
2096
2141
|
publishEvent(event) {
|
|
2097
2142
|
return Effect.sync(() => {
|