@m4trix/evals 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var diff = require('diff');
11
+ var jsonDiff = require('json-diff');
12
12
  var React2 = require('react');
13
13
  var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
@@ -286,45 +286,46 @@ async function collectTestCasesFromFiles(config) {
286
286
  );
287
287
  return found.flat();
288
288
  }
289
- function toJsonLines(value) {
289
+ function createDiffString(expected, actual, diffOptions) {
290
+ const opts = { ...diffOptions, color: false };
291
+ const result = jsonDiff.diffString(expected, actual, opts);
292
+ return typeof result === "string" ? result : "";
293
+ }
294
+ function formatLogMessage(msg) {
295
+ if (typeof msg === "string")
296
+ return msg;
290
297
  try {
291
- return JSON.stringify(value, null, 2);
298
+ if (msg !== null && typeof msg === "object") {
299
+ return JSON.stringify(msg, null, 2);
300
+ }
301
+ return String(msg);
292
302
  } catch {
293
- return String(value);
303
+ return String(msg);
294
304
  }
295
305
  }
296
- function formatDiffString(changes) {
297
- const lines = [];
298
- for (const part of changes) {
299
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
300
- const partLines = part.value.split("\n");
301
- if (partLines[partLines.length - 1] === "") {
302
- partLines.pop();
303
- }
304
- for (const line of partLines) {
305
- lines.push(`${prefix} ${line}`);
306
- }
307
- }
308
- return lines.join("\n");
306
+ function createLogEntry(message, options) {
307
+ return {
308
+ type: "log",
309
+ label: options?.label,
310
+ message: formatLogMessage(message)
311
+ };
309
312
  }
310
- function createDiffString(expected, actual) {
311
- const expectedStr = toJsonLines(expected);
312
- const actualStr = toJsonLines(actual);
313
- const changes = diff.diffLines(expectedStr, actualStr);
314
- return formatDiffString(changes);
313
+ function getLogLines(entry) {
314
+ return entry.message.split("\n");
315
315
  }
316
316
  function createDiffLogEntry(expected, actual, options) {
317
- const diff = createDiffString(expected, actual);
317
+ const { label, ...diffOpts } = options ?? {};
318
+ const diff = createDiffString(expected, actual, diffOpts);
318
319
  return {
319
320
  type: "diff",
320
- label: options?.label,
321
+ label,
321
322
  expected,
322
323
  actual,
323
324
  diff: diff || "(no differences)"
324
325
  };
325
326
  }
326
327
  function getDiffLines(entry) {
327
- const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
328
+ const raw = entry.diff || "(no differences)";
328
329
  return raw.split("\n").map((line) => {
329
330
  const trimmed = line.trimStart();
330
331
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -586,6 +587,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
586
587
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
587
588
  const rerunPassed = [];
588
589
  for (let r = 0; r < reruns; r++) {
590
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
589
591
  const started = Date.now();
590
592
  const evaluatorScores = [];
591
593
  let testCaseError;
@@ -600,6 +602,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
600
602
  const logDiff = (expected, actual, options) => {
601
603
  logs.push(createDiffLogEntry(expected, actual, options));
602
604
  };
605
+ const log = (message, options) => {
606
+ logs.push(createLogEntry(message, options));
607
+ };
603
608
  const ctx = yield* effect.Effect.promise(
604
609
  () => Promise.resolve(evaluator.resolveContext())
605
610
  );
@@ -609,7 +614,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
609
614
  input: testCaseItem.testCase.getInput(),
610
615
  ctx,
611
616
  output,
612
- logDiff
617
+ meta: {
618
+ triggerId: task.triggerId,
619
+ runId: evaluatorRunId,
620
+ datasetId: task.datasetId
621
+ },
622
+ logDiff,
623
+ log
613
624
  })
614
625
  )
615
626
  );
@@ -1067,6 +1078,7 @@ var EffectRunner = class {
1067
1078
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1068
1079
  0
1069
1080
  );
1081
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1070
1082
  const runId = `run-${crypto.randomUUID()}`;
1071
1083
  const artifactPath = createArtifactPath(
1072
1084
  this.config.artifactDirectory,
@@ -1108,6 +1120,7 @@ var EffectRunner = class {
1108
1120
  await effect.Effect.runPromise(
1109
1121
  effect.Queue.offer(this.runQueue, {
1110
1122
  runId,
1123
+ triggerId,
1111
1124
  datasetId: request.datasetId,
1112
1125
  dataset: dataset.dataset,
1113
1126
  evaluators: selectedEvaluators,
@@ -1782,7 +1795,7 @@ function RunView({
1782
1795
  },
1783
1796
  lineIdx
1784
1797
  )
1785
- ) }, logIdx) : null
1798
+ ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
1786
1799
  ) })
1787
1800
  ]
1788
1801
  },
@@ -2286,6 +2299,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2286
2299
  const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2287
2300
  lines.push(colored);
2288
2301
  }
2302
+ } else if (log.type === "log") {
2303
+ for (const line of getLogLines(log)) {
2304
+ lines.push(` ${line}`);
2305
+ }
2289
2306
  }
2290
2307
  }
2291
2308
  }