@m4trix/evals 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -10,7 +10,7 @@ import { resolve, relative, join, dirname } from 'path';
10
10
  import * as jitiModule from 'jiti';
11
11
  import { mkdir, appendFile, readdir } from 'fs/promises';
12
12
  import { pathToFileURL } from 'url';
13
- import 'json-diff';
13
+ import { diffString } from 'json-diff';
14
14
 
15
15
  var SEP = " ";
16
16
  var ARROW = "\u203A";
@@ -1519,6 +1519,16 @@ async function collectTestCasesFromFiles(config) {
1519
1519
  );
1520
1520
  return found.flat();
1521
1521
  }
1522
/**
 * Builds a "diff" log entry describing how `actual` differs from `expected`.
 *
 * The textual diff is produced by `diffString` (imported from 'json-diff'),
 * called with `{ color: false }` (presumably to get plain text without ANSI
 * colour codes; confirm against the json-diff documentation).
 *
 * @param {*} expected - Reference value; stored on the entry as-is (not copied).
 * @param {*} actual - Observed value; stored on the entry as-is (not copied).
 * @param {{ label?: string }} [options] - Optional settings. Only `label` is
 *   read; when `options` is omitted or has no `label`, the entry's `label` is
 *   `undefined`.
 * @returns {{ type: "diff", label: (string|undefined), expected: *, actual: *, diff: string }}
 *   The log entry. `diff` falls back to the literal "(no differences)" whenever
 *   `diffString` returns a falsy value (e.g. an empty string).
 */
+ function createDiffLogEntry(expected, actual, options) {
1523
+ const diff = diffString(expected, actual, { color: false });
1524
+ return {
1525
+ type: "diff",
1526
+ label: options?.label,
1527
+ expected,
1528
+ actual,
1529
+ diff: diff || "(no differences)"
1530
+ };
1531
+ }
1522
1532
 
1523
1533
  // src/evals/metric.ts
1524
1534
  var registry = /* @__PURE__ */ new Map();
@@ -1699,6 +1709,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1699
1709
  continue;
1700
1710
  }
1701
1711
  try {
1712
+ const logs = [];
1713
+ const logDiff = (expected, actual, options) => {
1714
+ logs.push(createDiffLogEntry(expected, actual, options));
1715
+ };
1702
1716
  const ctx = yield* Effect.promise(
1703
1717
  () => Promise.resolve(evaluator.resolveContext())
1704
1718
  );
@@ -1707,13 +1721,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1707
1721
  evaluateFn({
1708
1722
  input: testCaseItem.testCase.getInput(),
1709
1723
  ctx,
1710
- output
1724
+ output,
1725
+ logDiff
1711
1726
  })
1712
1727
  )
1713
1728
  );
1714
1729
  const { scores, metrics } = normalizeResult(result);
1715
1730
  const passed = computeEvaluatorPassed(evaluator, result, scores);
1716
- evaluatorScores.push({ evaluatorId, scores, passed, metrics });
1731
+ evaluatorScores.push({
1732
+ evaluatorId,
1733
+ scores,
1734
+ passed,
1735
+ metrics,
1736
+ logs: logs.length > 0 ? logs : void 0
1737
+ });
1717
1738
  } catch (error) {
1718
1739
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1719
1740
  evaluatorScores.push({