@m4trix/evals 0.20.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +222 -28
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +221 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +115 -5
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +114 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +117 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +10 -2
- package/dist/index.js +114 -5
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/index.cjs
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var effect = require('effect');
|
|
4
|
-
var
|
|
4
|
+
var diff = require('diff');
|
|
5
|
+
var stringify = require('fast-json-stable-stringify');
|
|
5
6
|
var crypto = require('crypto');
|
|
6
7
|
var fs = require('fs');
|
|
7
8
|
var path = require('path');
|
|
@@ -10,6 +11,8 @@ var promises = require('fs/promises');
|
|
|
10
11
|
var url = require('url');
|
|
11
12
|
|
|
12
13
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
14
|
+
/**
 * Bundler interop helper: gives a required module a `.default` property.
 *
 * @param {*} e - The value returned by `require`.
 * @returns {*} `e` itself when it is flagged with a truthy `__esModule`,
 *   otherwise a wrapper object `{ default: e }`.
 */
function _interopDefault(e) {
  // Modules already flagged as ES modules carry their own `default`.
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
|
|
15
|
+
|
|
13
16
|
function _interopNamespace(e) {
|
|
14
17
|
if (e && e.__esModule) return e;
|
|
15
18
|
var n = Object.create(null);
|
|
@@ -28,6 +31,7 @@ function _interopNamespace(e) {
|
|
|
28
31
|
return Object.freeze(n);
|
|
29
32
|
}
|
|
30
33
|
|
|
34
|
+
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
31
35
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
32
36
|
|
|
33
37
|
// src/cli/data.mock.json
|
|
@@ -730,10 +734,102 @@ var binaryScore = Score.of({
|
|
|
730
734
|
},
|
|
731
735
|
aggregateValues: Score.aggregate.all
|
|
732
736
|
});
|
|
737
|
+
/**
 * Recursively normalizes a value before it is serialized for diffing.
 *
 * - Arrays: every element is preprocessed; when `options.sort` is truthy the
 *   preprocessed elements are additionally ordered by their stable-JSON text.
 * - Objects: every entry is preprocessed; keys listed in `options.excludeKeys`
 *   (an array, or a comma-separated string) are dropped.
 * - Numbers: rounded to `options.precision` decimals when it is defined.
 * - Anything else is returned unchanged.
 *
 * The input is never mutated; arrays and objects are returned as new values.
 *
 * @param {*} value - The value to normalize.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} The normalized copy of `value`.
 */
function preprocessForDiff(value, options) {
  if (Array.isArray(value)) {
    // Always recurse into arrays: previously nested values were only
    // normalized when `sort` was set, so `excludeKeys`/`precision` were
    // silently ignored inside unsorted arrays.
    // Array.from (like spread) turns holes into `undefined` entries.
    const items = Array.from(value, (item) => preprocessForDiff(item, options));
    if (!options?.sort) {
      return items;
    }
    // Serialize each element once instead of re-running the whole recursive
    // preprocessing + stringify on every comparison.
    const sortKeys = new Map();
    const sortKeyOf = (item) => {
      if (!sortKeys.has(item)) {
        sortKeys.set(item, stringify__default.default(item));
      }
      return sortKeys.get(item);
    };
    // `items` is a fresh array, so sorting it in place is safe.
    return items.sort((a, b) => sortKeyOf(a).localeCompare(sortKeyOf(b)));
  }
  if (value !== null && typeof value === "object") {
    const excludeKeys = options?.excludeKeys;
    let excluded = [];
    if (excludeKeys) {
      excluded = Array.isArray(excludeKeys)
        ? excludeKeys
        : excludeKeys.split(",").map((k) => k.trim());
    }
    const result = {};
    for (const [k, v] of Object.entries(value)) {
      if (!excluded.includes(k)) {
        result[k] = preprocessForDiff(v, options);
      }
    }
    return result;
  }
  if (typeof value === "number" && options?.precision !== void 0) {
    return Number(value.toFixed(options.precision));
  }
  return value;
}
|
|
767
|
+
/**
 * Serializes a value as 2-space-indented JSON using the stable stringifier,
 * then re-parses and pretty-prints that text.
 *
 * @param {*} value - The value to serialize.
 * @returns {string} Pretty-printed JSON. Always a string: values the
 *   stringifier cannot represent fall back to `String(value)`.
 */
function toPrettyJson(value) {
  const stable = stringify__default.default(value);
  if (typeof stable !== "string") {
    // The stringifier produced no text (e.g. for `undefined`). Returning that
    // non-string would break the line diff that consumes this result.
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(stable), null, 2);
  } catch {
    // Best effort: if the text cannot be re-parsed, keep the compact form.
    return stable;
  }
}
|
|
776
|
+
/**
 * Renders a list of diff parts as plain text, one output line per line of
 * each part's `value`.
 *
 * Lines from parts flagged `added` are prefixed with "+ ", lines from parts
 * flagged `removed` with "- ", and all other lines get no prefix. The empty
 * string that `split("\n")` leaves after a trailing newline is dropped.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted lines joined with "\n".
 */
function formatDiffParts(parts) {
  const output = [];
  for (const part of parts) {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // Only the final segment is dropped when empty; blank lines in the
    // middle of a part are kept.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    for (const segment of segments) {
      output.push(marker + segment);
    }
  }
  return output.join("\n");
}
|
|
733
790
|
/**
 * Builds a line-based textual diff between an expected and an actual value.
 *
 * Both values are first normalized with `preprocessForDiff`. With
 * `diffOptions.keysOnly` the diff is computed on the key structure only
 * (see `extractKeys`); otherwise on the pretty-printed JSON of the values.
 * With `diffOptions.outputNewOnly` (non-keysOnly mode) only added lines are
 * reported.
 *
 * @param {*} expected - The reference value.
 * @param {*} actual - The value to compare against the reference.
 * @param {object} [diffOptions] - Normalization and output options; also
 *   forwarded to `preprocessForDiff`.
 * @returns {string} The formatted diff, or "" when both sides serialize to
 *   the same text.
 */
function createDiffString(expected, actual, diffOptions) {
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
  const actualProcessed = preprocessForDiff(actual, diffOptions);
  if (diffOptions?.keysOnly) {
    const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
    const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
    // Same contract as the value diff below: identical inputs mean "no diff",
    // not the whole key structure echoed back as unchanged lines.
    if (expectedKeys === actualKeys) {
      return "";
    }
    return formatDiffParts(diff.diffLines(expectedKeys, actualKeys));
  }
  const expectedStr = toPrettyJson(expectedProcessed);
  const actualStr = toPrettyJson(actualProcessed);
  if (expectedStr === actualStr) {
    return "";
  }
  const parts = diff.diffLines(expectedStr, actualStr);
  if (diffOptions?.outputNewOnly) {
    // Keep only the lines that exist on the actual side.
    return formatDiffParts(parts.filter((p) => p.added === true));
  }
  return formatDiffParts(parts);
}
|
|
821
|
+
/**
 * Reduces a value to its key structure: every non-object leaf (including
 * `null`) is replaced by the placeholder "\xB7", arrays are mapped
 * element-wise, and objects keep their own enumerable keys with each value
 * reduced recursively.
 *
 * @param {*} value - The value to reduce.
 * @returns {*} The placeholder string, an array, or a plain object that
 *   mirrors the shape of `value`.
 */
function extractKeys(value) {
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  const isObject = typeof value === "object" && value !== null;
  if (!isObject) {
    return "\xB7";
  }
  const skeleton = {};
  for (const key of Object.keys(value)) {
    skeleton[key] = extractKeys(value[key]);
  }
  return skeleton;
}
|
|
738
834
|
function formatLogMessage(msg) {
|
|
739
835
|
if (typeof msg === "string")
|
|
@@ -1128,13 +1224,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1128
1224
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1129
1225
|
);
|
|
1130
1226
|
}
|
|
1131
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1227
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1132
1228
|
return effect.Effect.gen(function* () {
|
|
1133
1229
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1134
1230
|
const rerunPassed = [];
|
|
1135
1231
|
for (let r = 0; r < reruns; r++) {
|
|
1136
1232
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1137
1233
|
const started = Date.now();
|
|
1234
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1235
|
+
n + 1,
|
|
1236
|
+
n + 1
|
|
1237
|
+
]);
|
|
1238
|
+
yield* publishEvent({
|
|
1239
|
+
type: "TestCaseStarted",
|
|
1240
|
+
runId: task.runId,
|
|
1241
|
+
testCaseId: testCaseItem.id,
|
|
1242
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1243
|
+
startedTestCases: startedEvaluations,
|
|
1244
|
+
totalTestCases: totalEvaluations,
|
|
1245
|
+
rerunIndex: r + 1,
|
|
1246
|
+
rerunTotal: reruns
|
|
1247
|
+
});
|
|
1138
1248
|
const evaluatorScores = [];
|
|
1139
1249
|
let testCaseError;
|
|
1140
1250
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1280,6 +1390,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1280
1390
|
);
|
|
1281
1391
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1282
1392
|
const completedRef = yield* effect.Ref.make(0);
|
|
1393
|
+
const startedRef = yield* effect.Ref.make(0);
|
|
1283
1394
|
const passedRef = yield* effect.Ref.make(0);
|
|
1284
1395
|
const failedRef = yield* effect.Ref.make(0);
|
|
1285
1396
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1289,6 +1400,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1289
1400
|
publishEvent,
|
|
1290
1401
|
persistenceQueue,
|
|
1291
1402
|
updateSnapshot,
|
|
1403
|
+
startedRef,
|
|
1292
1404
|
completedRef,
|
|
1293
1405
|
passedRef,
|
|
1294
1406
|
failedRef
|