@m4trix/evals 0.20.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,7 +1,8 @@
1
1
  'use strict';
2
2
 
3
3
  var effect = require('effect');
4
- var jsonDiff = require('json-diff');
4
+ var diff = require('diff');
5
+ var stringify = require('fast-json-stable-stringify');
5
6
  var crypto = require('crypto');
6
7
  var fs = require('fs');
7
8
  var path = require('path');
@@ -10,6 +11,8 @@ var promises = require('fs/promises');
10
11
  var url = require('url');
11
12
 
12
13
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
14
/**
 * CommonJS/ES-module interop helper.
 *
 * Returns `e` untouched when it is truthy and carries a truthy `__esModule`
 * flag; otherwise wraps it as `{ default: e }` so callers can always read
 * `.default`.
 *
 * @param {*} e - The result of a `require(...)` call.
 * @returns {*} `e` itself, or a `{ default: e }` wrapper.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
15
+
13
16
  function _interopNamespace(e) {
14
17
  if (e && e.__esModule) return e;
15
18
  var n = Object.create(null);
@@ -28,6 +31,7 @@ function _interopNamespace(e) {
28
31
  return Object.freeze(n);
29
32
  }
30
33
 
34
+ var stringify__default = /*#__PURE__*/_interopDefault(stringify);
31
35
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
32
36
 
33
37
  // src/cli/data.mock.json
@@ -730,10 +734,102 @@ var binaryScore = Score.of({
730
734
  },
731
735
  aggregateValues: Score.aggregate.all
732
736
  });
737
/**
 * Normalizes a value before it is rendered for diffing.
 *
 * The value is walked recursively and a new structure is returned; the input
 * is never mutated. Supported options:
 * - `excludeKeys`: array of key names, or a comma-separated string of them;
 *   matching object keys are dropped at every depth.
 * - `precision`: when defined, numbers are rounded with `toFixed(precision)`.
 * - `sort`: when truthy, array items are ordered by their stable-stringified
 *   form (compared with `localeCompare`).
 *
 * Arrays are always traversed, so `excludeKeys` and `precision` also apply to
 * items nested inside arrays when `sort` is not set.
 *
 * @param {*} value - Any JSON-like value.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} The normalized copy of `value`.
 */
function preprocessForDiff(value, options) {
  if (Array.isArray(value)) {
    const items = value.map((item) => preprocessForDiff(item, options));
    if (!options?.sort) {
      return items;
    }
    // Compute each sort key once instead of re-processing and re-stringifying
    // both operands on every comparison.
    const sortKeys = new Map();
    for (const item of items) {
      // The stringifier yields undefined for values JSON cannot represent;
      // fall back to "" so the comparator never calls a method on undefined.
      sortKeys.set(item, stringify__default.default(item) ?? "");
    }
    // `items` is a fresh local array, so sorting it in place is safe.
    return items.sort((a, b) => sortKeys.get(a).localeCompare(sortKeys.get(b)));
  }
  if (value !== null && typeof value === "object") {
    // Values such as Date expose no enumerable entries; without this they
    // would collapse to `{}` and real differences would disappear.
    if (typeof value.toJSON === "function") {
      const serialized = value.toJSON();
      if (serialized !== value) {
        return preprocessForDiff(serialized, options);
      }
    }
    const excludeKeys = options?.excludeKeys;
    let excluded = [];
    if (Array.isArray(excludeKeys)) {
      excluded = excludeKeys;
    } else if (typeof excludeKeys === "string" && excludeKeys !== "") {
      excluded = excludeKeys.split(",").map((k) => k.trim());
    }
    const result = {};
    for (const [key, child] of Object.entries(value)) {
      if (!excluded.includes(key)) {
        result[key] = preprocessForDiff(child, options);
      }
    }
    return result;
  }
  if (typeof value === "number" && options?.precision !== void 0) {
    return Number(value.toFixed(options.precision));
  }
  return value;
}
767
/**
 * Renders a value as two-space-indented JSON text.
 *
 * The value is first serialized with the bundled stable stringifier
 * (`stringify__default.default`), then re-parsed and pretty-printed.
 *
 * Always returns a string: when the stringifier produces no output (for
 * example for `undefined`), the textual form of that result ("undefined") is
 * returned so callers can diff it line by line.
 *
 * @param {*} value - Value to render.
 * @returns {string} Pretty-printed JSON, or a plain-text fallback.
 */
function toPrettyJson(value) {
  const compact = stringify__default.default(value);
  if (typeof compact !== "string") {
    return String(compact);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // Deliberate fallback: if the compact form cannot be re-parsed, the
    // compact text is still usable as diff input.
    return compact;
  }
}
776
/**
 * Flattens diff parts into a single newline-joined string.
 *
 * Each part's `value` is split on "\n"; every resulting line is prefixed with
 * "+ " for added parts, "- " for removed parts, and nothing otherwise. A
 * trailing empty segment (produced by a part ending in a newline) is dropped.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const output = [];
  for (const part of parts) {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // split() always yields at least one segment; only a final empty one is skipped.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    output.push(...segments.map((text) => marker + text));
  }
  return output.join("\n");
}
733
790
  function createDiffString(expected, actual, diffOptions) {
734
- const opts = { ...diffOptions, color: false };
735
- const result = jsonDiff.diffString(expected, actual, opts);
736
- return typeof result === "string" ? result : "";
791
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
792
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
793
+ if (diffOptions?.keysOnly) {
794
+ const expectedKeys = JSON.stringify(
795
+ extractKeys(expectedProcessed),
796
+ null,
797
+ 2
798
+ );
799
+ const actualKeys = JSON.stringify(
800
+ extractKeys(actualProcessed),
801
+ null,
802
+ 2
803
+ );
804
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
805
+ return formatDiffParts(parts2);
806
+ }
807
+ const expectedStr = toPrettyJson(expectedProcessed);
808
+ const actualStr = toPrettyJson(actualProcessed);
809
+ if (expectedStr === actualStr) {
810
+ return "";
811
+ }
812
+ const parts = diff.diffLines(expectedStr, actualStr);
813
+ if (diffOptions?.outputNewOnly) {
814
+ const filtered = parts.filter(
815
+ (p) => p.added === true
816
+ );
817
+ return formatDiffParts(filtered);
818
+ }
819
+ return formatDiffParts(parts);
820
+ }
821
/**
 * Reduces a value to its key structure.
 *
 * Every non-object leaf (including `null`) becomes the placeholder "\xB7";
 * arrays are mapped item by item; objects keep their own enumerable keys with
 * each value reduced recursively.
 *
 * @param {*} value - Any value.
 * @returns {*} A structure of the same shape containing only placeholders.
 */
function extractKeys(value) {
  const isContainer = typeof value === "object" && value !== null;
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  const shape = {};
  Object.entries(value).forEach(([key, child]) => {
    shape[key] = extractKeys(child);
  });
  return shape;
}
738
834
  function formatLogMessage(msg) {
739
835
  if (typeof msg === "string")
@@ -1128,13 +1224,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1128
1224
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1129
1225
  );
1130
1226
  }
1131
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1227
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1132
1228
  return effect.Effect.gen(function* () {
1133
1229
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1134
1230
  const rerunPassed = [];
1135
1231
  for (let r = 0; r < reruns; r++) {
1136
1232
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1137
1233
  const started = Date.now();
1234
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1235
+ n + 1,
1236
+ n + 1
1237
+ ]);
1238
+ yield* publishEvent({
1239
+ type: "TestCaseStarted",
1240
+ runId: task.runId,
1241
+ testCaseId: testCaseItem.id,
1242
+ testCaseName: testCaseItem.testCase.getName(),
1243
+ startedTestCases: startedEvaluations,
1244
+ totalTestCases: totalEvaluations,
1245
+ rerunIndex: r + 1,
1246
+ rerunTotal: reruns
1247
+ });
1138
1248
  const evaluatorScores = [];
1139
1249
  let testCaseError;
1140
1250
  const output = readOutput(testCaseItem.testCase);
@@ -1280,6 +1390,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1280
1390
  );
1281
1391
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1282
1392
  const completedRef = yield* effect.Ref.make(0);
1393
+ const startedRef = yield* effect.Ref.make(0);
1283
1394
  const passedRef = yield* effect.Ref.make(0);
1284
1395
  const failedRef = yield* effect.Ref.make(0);
1285
1396
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1289,6 +1400,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1289
1400
  publishEvent,
1290
1401
  persistenceQueue,
1291
1402
  updateSnapshot,
1403
+ startedRef,
1292
1404
  completedRef,
1293
1405
  passedRef,
1294
1406
  failedRef