@m4trix/evals 0.20.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -11,7 +11,8 @@ import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
12
  import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
13
  import { pathToFileURL } from 'url';
14
- import { diffString } from 'json-diff';
14
+ import { diffLines } from 'diff';
15
+ import stringify from 'fast-json-stable-stringify';
15
16
 
16
17
  var SEP = " ";
17
18
  var ARROW = "\u203A";
@@ -978,10 +979,102 @@ async function collectTestCasesFromFiles(config) {
978
979
  );
979
980
  return found.flat();
980
981
  }
982
/**
 * Normalizes a value before it is serialized and diffed.
 *
 * The value is rebuilt recursively; the input is never mutated.
 *
 * @param {*} value - Any JSON-like value (primitive, array, or object).
 * @param {object} [options] - Diff options; every field is optional.
 * @param {boolean} [options.sort] - When truthy, array items are ordered by
 *   their stable JSON serialization so element order does not produce diffs.
 * @param {string|string[]} [options.excludeKeys] - Property names to drop from
 *   every object, as an array or a comma-separated string.
 * @param {number} [options.precision] - Number of decimals that numbers are
 *   rounded to (passed to `Number.prototype.toFixed`).
 * @returns {*} The normalized copy of `value`.
 */
function preprocessForDiff(value, options) {
  // Parse the exclusion list once instead of once per visited object.
  const rawExcludeKeys = options?.excludeKeys;
  const excluded = rawExcludeKeys
    ? new Set(
        Array.isArray(rawExcludeKeys)
          ? rawExcludeKeys
          : rawExcludeKeys.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const shouldSort = Boolean(options?.sort);

  const visit = (node) => {
    if (Array.isArray(node)) {
      // Always recurse into arrays so excludeKeys/precision also reach items
      // nested inside them, whether or not sorting was requested.
      const items = node.map((item) => visit(item));
      if (!shouldSort) {
        return items;
      }
      // Serialize each item once. stringify() yields undefined for values
      // that have no JSON form (undefined, functions, symbols), so fall back
      // to "" rather than calling a string method on undefined.
      return items
        .map((item) => ({ item, key: stringify(item) ?? "" }))
        // Plain code-unit ordering: a strict total order that does not depend
        // on the host locale.
        .sort((a, b) => (a.key < b.key ? -1 : a.key > b.key ? 1 : 0))
        .map((entry) => entry.item);
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (excluded === null || !excluded.has(k)) {
          result[k] = visit(v);
        }
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return visit(value);
}
1012
/**
 * Serializes a value to 2-space indented JSON with a stable key order.
 *
 * The value is first run through `stringify` (stable key ordering) and then
 * re-parsed and pretty-printed.
 *
 * @param {*} value - The value to serialize.
 * @returns {string} Always a string, so the result can be handed to a line
 *   differ. Values with no JSON representation (for example `undefined`)
 *   are rendered with `String(value)`.
 */
function toPrettyJson(value) {
  const compact = stringify(value);
  // stringify() returns undefined for values with no JSON form; returning
  // that would pass a non-string on to the line differ.
  if (typeof compact !== "string") {
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // Keep the compact form if it cannot be re-parsed.
    return compact;
  }
}
1021
/**
 * Renders a list of diff parts as text, one output line per source line.
 *
 * Lines of added parts are prefixed with "+ ", lines of removed parts with
 * "- ", and lines of unchanged parts get no prefix. The empty segment left
 * by a trailing newline in a part's value is not emitted.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffParts(parts) {
  const markerFor = (part) => {
    if (part.added) {
      return "+ ";
    }
    if (part.removed) {
      return "- ";
    }
    return "";
  };

  return parts
    .flatMap((part) => {
      const marker = markerFor(part);
      const segments = part.value.split("\n");
      // Drop only the final segment, and only when it is empty.
      if (segments[segments.length - 1] === "") {
        segments.pop();
      }
      return segments.map((segment) => `${marker}${segment}`);
    })
    .join("\n");
}
981
1035
/**
 * Builds a line-based textual diff between an expected and an actual value.
 *
 * Both values are normalized with `preprocessForDiff` and rendered to text,
 * either as pretty JSON or, with `keysOnly`, as the key skeleton produced by
 * `extractKeys`. The two texts are then compared line by line.
 *
 * @param {*} expected - The reference value.
 * @param {*} actual - The value under test.
 * @param {object} [diffOptions] - Forwarded to `preprocessForDiff`; also read here:
 * @param {boolean} [diffOptions.keysOnly] - Compare only the key structure.
 * @param {boolean} [diffOptions.outputNewOnly] - Emit only added lines.
 * @returns {string} The formatted diff, or "" when the rendered texts are
 *   identical. That holds in `keysOnly` mode too.
 */
function createDiffString(expected, actual, diffOptions) {
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
  const actualProcessed = preprocessForDiff(actual, diffOptions);

  // One rendering function per mode, so the equality short-circuit and the
  // outputNewOnly filter below apply to both modes alike.
  const render = diffOptions?.keysOnly
    ? (processed) => JSON.stringify(extractKeys(processed), null, 2)
    : toPrettyJson;
  const expectedStr = render(expectedProcessed);
  const actualStr = render(actualProcessed);

  // Identical text means "no diff"; without this check the unchanged text
  // itself would be returned as a non-empty result.
  if (expectedStr === actualStr) {
    return "";
  }

  const parts = diffLines(expectedStr, actualStr);
  const visibleParts = diffOptions?.outputNewOnly
    ? parts.filter((p) => p.added === true)
    : parts;
  return formatDiffParts(visibleParts);
}
1066
/**
 * Reduces a value to its key skeleton.
 *
 * Objects keep their property names and arrays keep their length, while every
 * leaf (any non-object value, including null) is replaced by the placeholder
 * "\xB7".
 *
 * @param {*} value - The value to reduce.
 * @returns {string|Array|object} The placeholder for leaves, otherwise a
 *   structure of the same shape whose leaves are all placeholders.
 */
function extractKeys(value) {
  const isContainer = typeof value === "object" && value !== null;
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  return Object.entries(value).reduce((skeleton, [key, child]) => {
    skeleton[key] = extractKeys(child);
    return skeleton;
  }, {});
}
986
1079
  function formatLogMessage(msg) {
987
1080
  if (typeof msg === "string")
@@ -1332,13 +1425,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1332
1425
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1333
1426
  );
1334
1427
  }
1335
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1428
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1336
1429
  return Effect.gen(function* () {
1337
1430
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1338
1431
  const rerunPassed = [];
1339
1432
  for (let r = 0; r < reruns; r++) {
1340
1433
  const evaluatorRunId = `run-${randomUUID()}`;
1341
1434
  const started = Date.now();
1435
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1436
+ n + 1,
1437
+ n + 1
1438
+ ]);
1439
+ yield* publishEvent({
1440
+ type: "TestCaseStarted",
1441
+ runId: task.runId,
1442
+ testCaseId: testCaseItem.id,
1443
+ testCaseName: testCaseItem.testCase.getName(),
1444
+ startedTestCases: startedEvaluations,
1445
+ totalTestCases: totalEvaluations,
1446
+ rerunIndex: r + 1,
1447
+ rerunTotal: reruns
1448
+ });
1342
1449
  const evaluatorScores = [];
1343
1450
  let testCaseError;
1344
1451
  const output = readOutput(testCaseItem.testCase);
@@ -1484,6 +1591,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1484
1591
  );
1485
1592
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1486
1593
  const completedRef = yield* Ref.make(0);
1594
+ const startedRef = yield* Ref.make(0);
1487
1595
  const passedRef = yield* Ref.make(0);
1488
1596
  const failedRef = yield* Ref.make(0);
1489
1597
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1493,6 +1601,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1493
1601
  publishEvent,
1494
1602
  persistenceQueue,
1495
1603
  updateSnapshot,
1604
+ startedRef,
1496
1605
  completedRef,
1497
1606
  passedRef,
1498
1607
  failedRef