@m4trix/evals 0.20.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +222 -28
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +221 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +115 -5
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +114 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +117 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +10 -2
- package/dist/index.js +114 -5
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/index.d.ts
CHANGED
|
@@ -169,10 +169,9 @@ declare class Dataset {
|
|
|
169
169
|
|
|
170
170
|
/**
|
|
171
171
|
* Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
|
|
172
|
-
* @see https://www.npmjs.com/package/json-diff
|
|
173
172
|
*/
|
|
174
173
|
interface JsonDiffOptions {
|
|
175
|
-
/** Include equal sections of the document, not just deltas */
|
|
174
|
+
/** Include equal sections of the document, not just deltas (always true with current implementation) */
|
|
176
175
|
full?: boolean;
|
|
177
176
|
/** Sort primitive values in arrays before comparing */
|
|
178
177
|
sort?: boolean;
|
|
@@ -437,6 +436,15 @@ type RunnerEvent = {
|
|
|
437
436
|
type: 'RunStarted';
|
|
438
437
|
runId: string;
|
|
439
438
|
startedAt: number;
|
|
439
|
+
} | {
|
|
440
|
+
type: 'TestCaseStarted';
|
|
441
|
+
runId: string;
|
|
442
|
+
testCaseId: string;
|
|
443
|
+
testCaseName: string;
|
|
444
|
+
startedTestCases: number;
|
|
445
|
+
totalTestCases: number;
|
|
446
|
+
rerunIndex: number;
|
|
447
|
+
rerunTotal: number;
|
|
440
448
|
} | {
|
|
441
449
|
type: 'TestCaseProgress';
|
|
442
450
|
runId: string;
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
2
2
|
export { Schema as S } from 'effect';
|
|
3
|
-
import {
|
|
3
|
+
import { diffLines } from 'diff';
|
|
4
|
+
import stringify from 'fast-json-stable-stringify';
|
|
4
5
|
import { randomUUID } from 'crypto';
|
|
5
6
|
import { existsSync } from 'fs';
|
|
6
7
|
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
@@ -708,10 +709,102 @@ var binaryScore = Score.of({
|
|
|
708
709
|
},
|
|
709
710
|
aggregateValues: Score.aggregate.all
|
|
710
711
|
});
|
|
712
|
+
/**
 * Normalizes a value before it is serialized and diffed.
 *
 * The input is never mutated; arrays and plain objects are rebuilt.
 *
 * @param {unknown} value - Value to normalize.
 * @param {object} [options] - Diff options.
 * @param {boolean} [options.sort] - Order array elements by the
 *   `localeCompare` of their stable-stringified, already-normalized form.
 * @param {string[]|string} [options.excludeKeys] - Object keys to drop, as an
 *   array or a comma-separated string (entries are trimmed).
 * @param {number} [options.precision] - Digits passed to `toFixed` for every
 *   number encountered.
 * @returns {unknown} The normalized copy of `value`.
 */
function preprocessForDiff(value, options) {
  // Resolve the options once instead of re-parsing them for every node.
  const rawExclude = options?.excludeKeys;
  const excluded = new Set(
    !rawExclude ? [] : Array.isArray(rawExclude) ? rawExclude : rawExclude.split(",").map((k) => k.trim())
  );
  const sortArrays = Boolean(options?.sort);
  const precision = options?.precision;

  const walk = (node) => {
    if (Array.isArray(node)) {
      // Always descend into arrays so excludeKeys/precision also reach
      // elements nested inside them, not only when sorting is requested.
      const items = Array.from(node, (item) => walk(item));
      if (!sortArrays) {
        return items;
      }
      // Compute each sort key once. stringify yields no string for
      // undefined/function/symbol elements, so fall back to "" rather than
      // calling localeCompare on undefined.
      return items
        .map((item) => ({ item, key: stringify(item) ?? "" }))
        .sort((a, b) => a.key.localeCompare(b.key))
        .map(({ item }) => item);
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (!excluded.has(k)) {
          result[k] = walk(v);
        }
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return walk(value);
}
|
|
742
|
+
/**
 * Renders a value as indented JSON text for line-based diffing.
 *
 * The value is first serialized with `stringify`, then re-parsed and
 * pretty-printed with a two-space indent.
 *
 * @param {unknown} value - Value to render.
 * @returns {string} Always a string, so the result can be diffed directly.
 */
function toPrettyJson(value) {
  const canonical = stringify(value);
  if (typeof canonical !== "string") {
    // No JSON form exists (e.g. top-level undefined): return a readable
    // string instead of leaking `undefined` to the caller.
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(canonical), null, 2);
  } catch {
    // Deliberate best-effort: fall back to the compact form if it cannot be
    // re-parsed.
    return canonical;
  }
}
|
|
751
|
+
/**
 * Flattens diff parts into a single text block, one output line per source
 * line.
 *
 * Lines from added parts are prefixed with "+ ", lines from removed parts
 * with "- ", and all other lines are emitted unprefixed. The empty segment
 * produced by a part's trailing newline is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted lines joined by "\n".
 */
function formatDiffParts(parts) {
  const markerFor = (part) => {
    if (part.added) {
      return "+ ";
    }
    if (part.removed) {
      return "- ";
    }
    return "";
  };

  return parts
    .flatMap((part) => {
      const segments = part.value.split("\n");
      // Splitting "a\n" yields a trailing "" that is not a real line.
      if (segments[segments.length - 1] === "") {
        segments.pop();
      }
      const marker = markerFor(part);
      return segments.map((segment) => marker + segment);
    })
    .join("\n");
}
|
|
711
765
|
/**
 * Builds a line-based textual diff between two values.
 *
 * Both values are normalized with `preprocessForDiff`, rendered with
 * `toPrettyJson`, and compared with `diffLines`. With `keysOnly`, only the
 * key structure from `extractKeys` is compared.
 *
 * @param {unknown} expected - Baseline value.
 * @param {unknown} actual - Value compared against the baseline.
 * @param {object} [diffOptions] - Options; also forwarded to `preprocessForDiff`.
 * @param {boolean} [diffOptions.keysOnly] - Compare structure, not values.
 * @param {boolean} [diffOptions.outputNewOnly] - Emit only added lines.
 * @returns {string} The formatted diff, or "" when both renderings are equal.
 */
function createDiffString(expected, actual, diffOptions) {
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
  const actualProcessed = preprocessForDiff(actual, diffOptions);
  const keysOnly = Boolean(diffOptions?.keysOnly);

  // Render both modes through toPrettyJson so that key order alone never
  // produces a diff in keysOnly mode either.
  const expectedStr = toPrettyJson(keysOnly ? extractKeys(expectedProcessed) : expectedProcessed);
  const actualStr = toPrettyJson(keysOnly ? extractKeys(actualProcessed) : actualProcessed);

  // Identical renderings mean "no diff" in every mode.
  if (expectedStr === actualStr) {
    return "";
  }

  const parts = diffLines(expectedStr, actualStr);
  // outputNewOnly applies in keysOnly mode too.
  const visibleParts = diffOptions?.outputNewOnly ? parts.filter((p) => p.added === true) : parts;
  return formatDiffParts(visibleParts);
}
|
|
796
|
+
/**
 * Reduces a value to its key structure.
 *
 * Every leaf (null or any non-object) becomes the placeholder "\xB7"; arrays
 * and objects keep their shape with each element or property reduced
 * recursively.
 *
 * @param {unknown} value - Value whose structure is extracted.
 * @returns {unknown} The placeholder, an array of shapes, or an object of shapes.
 */
function extractKeys(value) {
  const isContainer = typeof value === "object" && value !== null;
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  const shape = {};
  Object.keys(value).forEach((key) => {
    shape[key] = extractKeys(value[key]);
  });
  return shape;
}
|
|
716
809
|
function formatLogMessage(msg) {
|
|
717
810
|
if (typeof msg === "string")
|
|
@@ -1106,13 +1199,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1106
1199
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1107
1200
|
);
|
|
1108
1201
|
}
|
|
1109
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1202
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1110
1203
|
return Effect.gen(function* () {
|
|
1111
1204
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1112
1205
|
const rerunPassed = [];
|
|
1113
1206
|
for (let r = 0; r < reruns; r++) {
|
|
1114
1207
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1115
1208
|
const started = Date.now();
|
|
1209
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1210
|
+
n + 1,
|
|
1211
|
+
n + 1
|
|
1212
|
+
]);
|
|
1213
|
+
yield* publishEvent({
|
|
1214
|
+
type: "TestCaseStarted",
|
|
1215
|
+
runId: task.runId,
|
|
1216
|
+
testCaseId: testCaseItem.id,
|
|
1217
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1218
|
+
startedTestCases: startedEvaluations,
|
|
1219
|
+
totalTestCases: totalEvaluations,
|
|
1220
|
+
rerunIndex: r + 1,
|
|
1221
|
+
rerunTotal: reruns
|
|
1222
|
+
});
|
|
1116
1223
|
const evaluatorScores = [];
|
|
1117
1224
|
let testCaseError;
|
|
1118
1225
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1258,6 +1365,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1258
1365
|
);
|
|
1259
1366
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1260
1367
|
const completedRef = yield* Ref.make(0);
|
|
1368
|
+
const startedRef = yield* Ref.make(0);
|
|
1261
1369
|
const passedRef = yield* Ref.make(0);
|
|
1262
1370
|
const failedRef = yield* Ref.make(0);
|
|
1263
1371
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1267,6 +1375,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1267
1375
|
publishEvent,
|
|
1268
1376
|
persistenceQueue,
|
|
1269
1377
|
updateSnapshot,
|
|
1378
|
+
startedRef,
|
|
1270
1379
|
completedRef,
|
|
1271
1380
|
passedRef,
|
|
1272
1381
|
failedRef
|