@m4trix/evals 0.20.0 → 0.21.1

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -169,10 +169,9 @@ declare class Dataset {
169
169
 
170
170
  /**
171
171
  * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
172
- * @see https://www.npmjs.com/package/json-diff
173
172
  */
174
173
  interface JsonDiffOptions {
175
- /** Include equal sections of the document, not just deltas */
174
+ /** Include equal sections of the document, not just deltas (always true with current implementation) */
176
175
  full?: boolean;
177
176
  /** Sort primitive values in arrays before comparing */
178
177
  sort?: boolean;
@@ -437,6 +436,15 @@ type RunnerEvent = {
437
436
  type: 'RunStarted';
438
437
  runId: string;
439
438
  startedAt: number;
439
+ } | {
440
+ type: 'TestCaseStarted';
441
+ runId: string;
442
+ testCaseId: string;
443
+ testCaseName: string;
444
+ startedTestCases: number;
445
+ totalTestCases: number;
446
+ rerunIndex: number;
447
+ rerunTotal: number;
440
448
  } | {
441
449
  type: 'TestCaseProgress';
442
450
  runId: string;
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
- import { diffString } from 'json-diff';
3
+ import { diffLines } from 'diff';
4
+ import stringify from 'fast-json-stable-stringify';
4
5
  import { randomUUID } from 'crypto';
5
6
  import { existsSync } from 'fs';
6
7
  import { resolve as resolve$1, relative, join, dirname } from 'path';
@@ -708,10 +709,102 @@ var binaryScore = Score.of({
708
709
  },
709
710
  aggregateValues: Score.aggregate.all
710
711
  });
712
+ function preprocessForDiff(value, options) {
713
+ if (options?.sort && Array.isArray(value)) {
714
+ return [...value].sort((a, b) => {
715
+ const aStr = stringify(preprocessForDiff(a, options));
716
+ const bStr = stringify(preprocessForDiff(b, options));
717
+ return aStr.localeCompare(bStr);
718
+ }).map((item) => preprocessForDiff(item, options));
719
+ }
720
+ if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
721
+ const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
722
+ const filtered = {};
723
+ for (const [k, v] of Object.entries(value)) {
724
+ if (!keys.includes(k)) {
725
+ filtered[k] = preprocessForDiff(v, options);
726
+ }
727
+ }
728
+ return filtered;
729
+ }
730
+ if (value !== null && typeof value === "object" && !Array.isArray(value)) {
731
+ const result = {};
732
+ for (const [k, v] of Object.entries(value)) {
733
+ result[k] = preprocessForDiff(v, options);
734
+ }
735
+ return result;
736
+ }
737
+ if (typeof value === "number" && options?.precision !== void 0) {
738
+ return Number(value.toFixed(options.precision));
739
+ }
740
+ return value;
741
+ }
742
+ function toPrettyJson(value) {
743
+ const str = stringify(value);
744
+ try {
745
+ const parsed = JSON.parse(str);
746
+ return JSON.stringify(parsed, null, 2);
747
+ } catch {
748
+ return str;
749
+ }
750
+ }
751
+ function formatDiffParts(parts) {
752
+ const lines = [];
753
+ for (const part of parts) {
754
+ const prefix = part.added ? "+ " : part.removed ? "- " : "";
755
+ const partLines = part.value.split("\n");
756
+ for (let i = 0; i < partLines.length; i++) {
757
+ const line = partLines[i];
758
+ if (i === partLines.length - 1 && line === "")
759
+ continue;
760
+ lines.push(prefix + line);
761
+ }
762
+ }
763
+ return lines.join("\n");
764
+ }
711
765
  function createDiffString(expected, actual, diffOptions) {
712
- const opts = { ...diffOptions, color: false };
713
- const result = diffString(expected, actual, opts);
714
- return typeof result === "string" ? result : "";
766
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
767
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
768
+ if (diffOptions?.keysOnly) {
769
+ const expectedKeys = JSON.stringify(
770
+ extractKeys(expectedProcessed),
771
+ null,
772
+ 2
773
+ );
774
+ const actualKeys = JSON.stringify(
775
+ extractKeys(actualProcessed),
776
+ null,
777
+ 2
778
+ );
779
+ const parts2 = diffLines(expectedKeys, actualKeys);
780
+ return formatDiffParts(parts2);
781
+ }
782
+ const expectedStr = toPrettyJson(expectedProcessed);
783
+ const actualStr = toPrettyJson(actualProcessed);
784
+ if (expectedStr === actualStr) {
785
+ return "";
786
+ }
787
+ const parts = diffLines(expectedStr, actualStr);
788
+ if (diffOptions?.outputNewOnly) {
789
+ const filtered = parts.filter(
790
+ (p) => p.added === true
791
+ );
792
+ return formatDiffParts(filtered);
793
+ }
794
+ return formatDiffParts(parts);
795
+ }
796
+ function extractKeys(value) {
797
+ if (value === null || typeof value !== "object") {
798
+ return "\xB7";
799
+ }
800
+ if (Array.isArray(value)) {
801
+ return value.map(extractKeys);
802
+ }
803
+ const result = {};
804
+ for (const [k, v] of Object.entries(value)) {
805
+ result[k] = extractKeys(v);
806
+ }
807
+ return result;
715
808
  }
716
809
  function formatLogMessage(msg) {
717
810
  if (typeof msg === "string")
@@ -1106,13 +1199,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1106
1199
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1107
1200
  );
1108
1201
  }
1109
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1202
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1110
1203
  return Effect.gen(function* () {
1111
1204
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1112
1205
  const rerunPassed = [];
1113
1206
  for (let r = 0; r < reruns; r++) {
1114
1207
  const evaluatorRunId = `run-${randomUUID()}`;
1115
1208
  const started = Date.now();
1209
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1210
+ n + 1,
1211
+ n + 1
1212
+ ]);
1213
+ yield* publishEvent({
1214
+ type: "TestCaseStarted",
1215
+ runId: task.runId,
1216
+ testCaseId: testCaseItem.id,
1217
+ testCaseName: testCaseItem.testCase.getName(),
1218
+ startedTestCases: startedEvaluations,
1219
+ totalTestCases: totalEvaluations,
1220
+ rerunIndex: r + 1,
1221
+ rerunTotal: reruns
1222
+ });
1116
1223
  const evaluatorScores = [];
1117
1224
  let testCaseError;
1118
1225
  const output = readOutput(testCaseItem.testCase);
@@ -1258,6 +1365,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1258
1365
  );
1259
1366
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1260
1367
  const completedRef = yield* Ref.make(0);
1368
+ const startedRef = yield* Ref.make(0);
1261
1369
  const passedRef = yield* Ref.make(0);
1262
1370
  const failedRef = yield* Ref.make(0);
1263
1371
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1267,6 +1375,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1267
1375
  publishEvent,
1268
1376
  persistenceQueue,
1269
1377
  updateSnapshot,
1378
+ startedRef,
1270
1379
  completedRef,
1271
1380
  passedRef,
1272
1381
  failedRef