@m4trix/evals 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -11,7 +11,7 @@ import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
12
  import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
13
  import { pathToFileURL } from 'url';
14
- import { diffLines } from 'diff';
14
+ import { diffString } from 'json-diff';
15
15
 
16
16
  var SEP = " ";
17
17
  var ARROW = "\u203A";
@@ -978,45 +978,46 @@ async function collectTestCasesFromFiles(config) {
978
978
  );
979
979
  return found.flat();
980
980
  }
981
- function toJsonLines(value) {
981
+ function createDiffString(expected, actual, diffOptions) {
982
+ const opts = { ...diffOptions, color: false };
983
+ const result = diffString(expected, actual, opts);
984
+ return typeof result === "string" ? result : "";
985
+ }
986
+ function formatLogMessage(msg) {
987
+ if (typeof msg === "string")
988
+ return msg;
982
989
  try {
983
- return JSON.stringify(value, null, 2);
990
+ if (msg !== null && typeof msg === "object") {
991
+ return JSON.stringify(msg, null, 2);
992
+ }
993
+ return String(msg);
984
994
  } catch {
985
- return String(value);
995
+ return String(msg);
986
996
  }
987
997
  }
988
- function formatDiffString(changes) {
989
- const lines = [];
990
- for (const part of changes) {
991
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
992
- const partLines = part.value.split("\n");
993
- if (partLines[partLines.length - 1] === "") {
994
- partLines.pop();
995
- }
996
- for (const line of partLines) {
997
- lines.push(`${prefix} ${line}`);
998
- }
999
- }
1000
- return lines.join("\n");
998
+ function createLogEntry(message, options) {
999
+ return {
1000
+ type: "log",
1001
+ label: options?.label,
1002
+ message: formatLogMessage(message)
1003
+ };
1001
1004
  }
1002
- function createDiffString(expected, actual) {
1003
- const expectedStr = toJsonLines(expected);
1004
- const actualStr = toJsonLines(actual);
1005
- const changes = diffLines(expectedStr, actualStr);
1006
- return formatDiffString(changes);
1005
+ function getLogLines(entry) {
1006
+ return entry.message.split("\n");
1007
1007
  }
1008
1008
  function createDiffLogEntry(expected, actual, options) {
1009
- const diff = createDiffString(expected, actual);
1009
+ const { label, ...diffOpts } = options ?? {};
1010
+ const diff = createDiffString(expected, actual, diffOpts);
1010
1011
  return {
1011
1012
  type: "diff",
1012
- label: options?.label,
1013
+ label,
1013
1014
  expected,
1014
1015
  actual,
1015
1016
  diff: diff || "(no differences)"
1016
1017
  };
1017
1018
  }
1018
1019
  function getDiffLines(entry) {
1019
- const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
1020
+ const raw = entry.diff || "(no differences)";
1020
1021
  return raw.split("\n").map((line) => {
1021
1022
  const trimmed = line.trimStart();
1022
1023
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -1260,6 +1261,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1260
1261
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1261
1262
  const rerunPassed = [];
1262
1263
  for (let r = 0; r < reruns; r++) {
1264
+ const evaluatorRunId = `run-${randomUUID()}`;
1263
1265
  const started = Date.now();
1264
1266
  const evaluatorScores = [];
1265
1267
  let testCaseError;
@@ -1274,6 +1276,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1274
1276
  const logDiff = (expected, actual, options) => {
1275
1277
  logs.push(createDiffLogEntry(expected, actual, options));
1276
1278
  };
1279
+ const log = (message, options) => {
1280
+ logs.push(createLogEntry(message, options));
1281
+ };
1277
1282
  const ctx = yield* Effect.promise(
1278
1283
  () => Promise.resolve(evaluator.resolveContext())
1279
1284
  );
@@ -1283,7 +1288,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1283
1288
  input: testCaseItem.testCase.getInput(),
1284
1289
  ctx,
1285
1290
  output,
1286
- logDiff
1291
+ meta: {
1292
+ triggerId: task.triggerId,
1293
+ runId: evaluatorRunId,
1294
+ datasetId: task.datasetId
1295
+ },
1296
+ logDiff,
1297
+ log
1287
1298
  })
1288
1299
  )
1289
1300
  );
@@ -1771,6 +1782,7 @@ var EffectRunner = class {
1771
1782
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1772
1783
  0
1773
1784
  );
1785
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1774
1786
  const runId = `run-${randomUUID()}`;
1775
1787
  const artifactPath = createArtifactPath(
1776
1788
  this.config.artifactDirectory,
@@ -1812,6 +1824,7 @@ var EffectRunner = class {
1812
1824
  await Effect.runPromise(
1813
1825
  Queue.offer(this.runQueue, {
1814
1826
  runId,
1827
+ triggerId,
1815
1828
  datasetId: request.datasetId,
1816
1829
  dataset: dataset.dataset,
1817
1830
  evaluators: selectedEvaluators,
@@ -2360,6 +2373,23 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2360
2373
  )
2361
2374
  );
2362
2375
  }
2376
+ } else if (log.type === "log") {
2377
+ const logLines = getLogLines(log);
2378
+ for (let lineIdx = 0; lineIdx < logLines.length; lineIdx++) {
2379
+ rows.push(
2380
+ /* @__PURE__ */ jsxs(
2381
+ Text,
2382
+ {
2383
+ color: "gray",
2384
+ children: [
2385
+ " ",
2386
+ logLines[lineIdx]
2387
+ ]
2388
+ },
2389
+ `tc-${tc.testCaseId}-${item.evaluatorId}-${logIdx}-${lineIdx}`
2390
+ )
2391
+ );
2392
+ }
2363
2393
  }
2364
2394
  }
2365
2395
  }