@m4trix/evals 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,7 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  var effect = require('effect');
4
- var diff = require('diff');
4
+ var jsonDiff = require('json-diff');
5
5
  var crypto = require('crypto');
6
6
  var fs = require('fs');
7
7
  var path = require('path');
@@ -660,46 +660,48 @@ var binaryScore = Score.of({
660
660
  },
661
661
  aggregate: aggregateAll
662
662
  });
663
- function toJsonLines(value) {
663
+ function createDiffString(expected, actual, diffOptions) {
664
+ const opts = { ...diffOptions, color: false };
665
+ const result = jsonDiff.diffString(expected, actual, opts);
666
+ return typeof result === "string" ? result : "";
667
+ }
668
+ function formatLogMessage(msg) {
669
+ if (typeof msg === "string")
670
+ return msg;
664
671
  try {
665
- return JSON.stringify(value, null, 2);
672
+ if (msg !== null && typeof msg === "object") {
673
+ return JSON.stringify(msg, null, 2);
674
+ }
675
+ return String(msg);
666
676
  } catch {
667
- return String(value);
677
+ return String(msg);
668
678
  }
669
679
  }
670
- function formatDiffString(changes) {
671
- const lines = [];
672
- for (const part of changes) {
673
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
674
- const partLines = part.value.split("\n");
675
- if (partLines[partLines.length - 1] === "") {
676
- partLines.pop();
677
- }
678
- for (const line of partLines) {
679
- lines.push(`${prefix} ${line}`);
680
- }
681
- }
682
- return lines.join("\n");
680
+ function createLogEntry(message, options) {
681
+ return {
682
+ type: "log",
683
+ label: options?.label,
684
+ message: formatLogMessage(message)
685
+ };
683
686
  }
684
- function createDiffString(expected, actual) {
685
- const expectedStr = toJsonLines(expected);
686
- const actualStr = toJsonLines(actual);
687
- const changes = diff.diffLines(expectedStr, actualStr);
688
- return formatDiffString(changes);
687
+ function getLogLines(entry) {
688
+ return entry.message.split("\n");
689
689
  }
690
690
  function createDiffLogEntry(expected, actual, options) {
691
- const diff = createDiffString(expected, actual);
691
+ const { label, ...diffOpts } = options ?? {};
692
+ const diff = createDiffString(expected, actual, diffOpts);
692
693
  return {
693
694
  type: "diff",
694
- label: options?.label,
695
+ label,
695
696
  expected,
696
697
  actual,
697
698
  diff: diff || "(no differences)"
698
699
  };
699
700
  }
700
701
  function printJsonDiff(expected, actual, options = {}) {
701
- const diff = createDiffString(expected, actual);
702
- if (options.color) {
702
+ const { color = true, ...diffOpts } = options;
703
+ const diff = createDiffString(expected, actual, diffOpts);
704
+ if (color) {
703
705
  const lines = diff.split("\n").map((line) => {
704
706
  const trimmed = line.trimStart();
705
707
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -1055,6 +1057,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1055
1057
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1056
1058
  const rerunPassed = [];
1057
1059
  for (let r = 0; r < reruns; r++) {
1060
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
1058
1061
  const started = Date.now();
1059
1062
  const evaluatorScores = [];
1060
1063
  let testCaseError;
@@ -1069,6 +1072,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1069
1072
  const logDiff = (expected, actual, options) => {
1070
1073
  logs.push(createDiffLogEntry(expected, actual, options));
1071
1074
  };
1075
+ const log = (message, options) => {
1076
+ logs.push(createLogEntry(message, options));
1077
+ };
1072
1078
  const ctx = yield* effect.Effect.promise(
1073
1079
  () => Promise.resolve(evaluator.resolveContext())
1074
1080
  );
@@ -1078,7 +1084,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1078
1084
  input: testCaseItem.testCase.getInput(),
1079
1085
  ctx,
1080
1086
  output,
1081
- logDiff
1087
+ meta: {
1088
+ triggerId: task.triggerId,
1089
+ runId: evaluatorRunId,
1090
+ datasetId: task.datasetId
1091
+ },
1092
+ logDiff,
1093
+ log
1082
1094
  })
1083
1095
  )
1084
1096
  );
@@ -1536,6 +1548,7 @@ var EffectRunner = class {
1536
1548
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1537
1549
  0
1538
1550
  );
1551
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1539
1552
  const runId = `run-${crypto.randomUUID()}`;
1540
1553
  const artifactPath = createArtifactPath(
1541
1554
  this.config.artifactDirectory,
@@ -1577,6 +1590,7 @@ var EffectRunner = class {
1577
1590
  await effect.Effect.runPromise(
1578
1591
  effect.Queue.offer(this.runQueue, {
1579
1592
  runId,
1593
+ triggerId,
1580
1594
  datasetId: request.datasetId,
1581
1595
  dataset: dataset.dataset,
1582
1596
  evaluators: selectedEvaluators,
@@ -1660,9 +1674,11 @@ exports.Metric = Metric;
1660
1674
  exports.Score = Score;
1661
1675
  exports.TestCase = TestCase;
1662
1676
  exports.binaryScore = binaryScore;
1677
+ exports.createLogEntry = createLogEntry;
1663
1678
  exports.createRunner = createRunner;
1664
1679
  exports.defaultRunnerConfig = defaultRunnerConfig;
1665
1680
  exports.defineConfig = defineConfig;
1681
+ exports.getLogLines = getLogLines;
1666
1682
  exports.getMetricById = getMetricById;
1667
1683
  exports.getScoreById = getScoreById;
1668
1684
  exports.latencyMetric = latencyMetric;