@m4trix/evals 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +44 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +44 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -26
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -26
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +43 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +67 -4
- package/dist/index.js +42 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/index.cjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var effect = require('effect');
|
|
4
|
-
var
|
|
4
|
+
var jsonDiff = require('json-diff');
|
|
5
5
|
var crypto = require('crypto');
|
|
6
6
|
var fs = require('fs');
|
|
7
7
|
var path = require('path');
|
|
@@ -660,46 +660,48 @@ var binaryScore = Score.of({
|
|
|
660
660
|
},
|
|
661
661
|
aggregate: aggregateAll
|
|
662
662
|
});
|
|
663
|
-
function
|
|
663
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
664
|
+
const opts = { ...diffOptions, color: false };
|
|
665
|
+
const result = jsonDiff.diffString(expected, actual, opts);
|
|
666
|
+
return typeof result === "string" ? result : "";
|
|
667
|
+
}
|
|
668
|
+
function formatLogMessage(msg) {
|
|
669
|
+
if (typeof msg === "string")
|
|
670
|
+
return msg;
|
|
664
671
|
try {
|
|
665
|
-
|
|
672
|
+
if (msg !== null && typeof msg === "object") {
|
|
673
|
+
return JSON.stringify(msg, null, 2);
|
|
674
|
+
}
|
|
675
|
+
return String(msg);
|
|
666
676
|
} catch {
|
|
667
|
-
return String(
|
|
677
|
+
return String(msg);
|
|
668
678
|
}
|
|
669
679
|
}
|
|
670
|
-
function
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
partLines.pop();
|
|
677
|
-
}
|
|
678
|
-
for (const line of partLines) {
|
|
679
|
-
lines.push(`${prefix} ${line}`);
|
|
680
|
-
}
|
|
681
|
-
}
|
|
682
|
-
return lines.join("\n");
|
|
680
|
+
function createLogEntry(message, options) {
|
|
681
|
+
return {
|
|
682
|
+
type: "log",
|
|
683
|
+
label: options?.label,
|
|
684
|
+
message: formatLogMessage(message)
|
|
685
|
+
};
|
|
683
686
|
}
|
|
684
|
-
function
|
|
685
|
-
|
|
686
|
-
const actualStr = toJsonLines(actual);
|
|
687
|
-
const changes = diff.diffLines(expectedStr, actualStr);
|
|
688
|
-
return formatDiffString(changes);
|
|
687
|
+
function getLogLines(entry) {
|
|
688
|
+
return entry.message.split("\n");
|
|
689
689
|
}
|
|
690
690
|
function createDiffLogEntry(expected, actual, options) {
|
|
691
|
-
const
|
|
691
|
+
const { label, ...diffOpts } = options ?? {};
|
|
692
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
692
693
|
return {
|
|
693
694
|
type: "diff",
|
|
694
|
-
label
|
|
695
|
+
label,
|
|
695
696
|
expected,
|
|
696
697
|
actual,
|
|
697
698
|
diff: diff || "(no differences)"
|
|
698
699
|
};
|
|
699
700
|
}
|
|
700
701
|
function printJsonDiff(expected, actual, options = {}) {
|
|
701
|
-
const
|
|
702
|
-
|
|
702
|
+
const { color = true, ...diffOpts } = options;
|
|
703
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
704
|
+
if (color) {
|
|
703
705
|
const lines = diff.split("\n").map((line) => {
|
|
704
706
|
const trimmed = line.trimStart();
|
|
705
707
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -1055,6 +1057,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1055
1057
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1056
1058
|
const rerunPassed = [];
|
|
1057
1059
|
for (let r = 0; r < reruns; r++) {
|
|
1060
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1058
1061
|
const started = Date.now();
|
|
1059
1062
|
const evaluatorScores = [];
|
|
1060
1063
|
let testCaseError;
|
|
@@ -1069,6 +1072,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1069
1072
|
const logDiff = (expected, actual, options) => {
|
|
1070
1073
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1071
1074
|
};
|
|
1075
|
+
const log = (message, options) => {
|
|
1076
|
+
logs.push(createLogEntry(message, options));
|
|
1077
|
+
};
|
|
1072
1078
|
const ctx = yield* effect.Effect.promise(
|
|
1073
1079
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1074
1080
|
);
|
|
@@ -1078,7 +1084,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1078
1084
|
input: testCaseItem.testCase.getInput(),
|
|
1079
1085
|
ctx,
|
|
1080
1086
|
output,
|
|
1081
|
-
|
|
1087
|
+
meta: {
|
|
1088
|
+
triggerId: task.triggerId,
|
|
1089
|
+
runId: evaluatorRunId,
|
|
1090
|
+
datasetId: task.datasetId
|
|
1091
|
+
},
|
|
1092
|
+
logDiff,
|
|
1093
|
+
log
|
|
1082
1094
|
})
|
|
1083
1095
|
)
|
|
1084
1096
|
);
|
|
@@ -1536,6 +1548,7 @@ var EffectRunner = class {
|
|
|
1536
1548
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1537
1549
|
0
|
|
1538
1550
|
);
|
|
1551
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1539
1552
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1540
1553
|
const artifactPath = createArtifactPath(
|
|
1541
1554
|
this.config.artifactDirectory,
|
|
@@ -1577,6 +1590,7 @@ var EffectRunner = class {
|
|
|
1577
1590
|
await effect.Effect.runPromise(
|
|
1578
1591
|
effect.Queue.offer(this.runQueue, {
|
|
1579
1592
|
runId,
|
|
1593
|
+
triggerId,
|
|
1580
1594
|
datasetId: request.datasetId,
|
|
1581
1595
|
dataset: dataset.dataset,
|
|
1582
1596
|
evaluators: selectedEvaluators,
|
|
@@ -1660,9 +1674,11 @@ exports.Metric = Metric;
|
|
|
1660
1674
|
exports.Score = Score;
|
|
1661
1675
|
exports.TestCase = TestCase;
|
|
1662
1676
|
exports.binaryScore = binaryScore;
|
|
1677
|
+
exports.createLogEntry = createLogEntry;
|
|
1663
1678
|
exports.createRunner = createRunner;
|
|
1664
1679
|
exports.defaultRunnerConfig = defaultRunnerConfig;
|
|
1665
1680
|
exports.defineConfig = defineConfig;
|
|
1681
|
+
exports.getLogLines = getLogLines;
|
|
1666
1682
|
exports.getMetricById = getMetricById;
|
|
1667
1683
|
exports.getScoreById = getScoreById;
|
|
1668
1684
|
exports.latencyMetric = latencyMetric;
|