@m4trix/evals 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffLines } from 'diff';
9
+ import { diffString } from 'json-diff';
10
10
  import React2, { useState, useEffect, useCallback } from 'react';
11
11
  import { render, Box, Text } from 'ink';
12
12
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
@@ -260,45 +260,46 @@ async function collectTestCasesFromFiles(config) {
260
260
  );
261
261
  return found.flat();
262
262
  }
263
- function toJsonLines(value) {
263
+ function createDiffString(expected, actual, diffOptions) {
264
+ const opts = { ...diffOptions, color: false };
265
+ const result = diffString(expected, actual, opts);
266
+ return typeof result === "string" ? result : "";
267
+ }
268
+ function formatLogMessage(msg) {
269
+ if (typeof msg === "string")
270
+ return msg;
264
271
  try {
265
- return JSON.stringify(value, null, 2);
272
+ if (msg !== null && typeof msg === "object") {
273
+ return JSON.stringify(msg, null, 2);
274
+ }
275
+ return String(msg);
266
276
  } catch {
267
- return String(value);
277
+ return String(msg);
268
278
  }
269
279
  }
270
- function formatDiffString(changes) {
271
- const lines = [];
272
- for (const part of changes) {
273
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
274
- const partLines = part.value.split("\n");
275
- if (partLines[partLines.length - 1] === "") {
276
- partLines.pop();
277
- }
278
- for (const line of partLines) {
279
- lines.push(`${prefix} ${line}`);
280
- }
281
- }
282
- return lines.join("\n");
280
+ function createLogEntry(message, options) {
281
+ return {
282
+ type: "log",
283
+ label: options?.label,
284
+ message: formatLogMessage(message)
285
+ };
283
286
  }
284
- function createDiffString(expected, actual) {
285
- const expectedStr = toJsonLines(expected);
286
- const actualStr = toJsonLines(actual);
287
- const changes = diffLines(expectedStr, actualStr);
288
- return formatDiffString(changes);
287
+ function getLogLines(entry) {
288
+ return entry.message.split("\n");
289
289
  }
290
290
  function createDiffLogEntry(expected, actual, options) {
291
- const diff = createDiffString(expected, actual);
291
+ const { label, ...diffOpts } = options ?? {};
292
+ const diff = createDiffString(expected, actual, diffOpts);
292
293
  return {
293
294
  type: "diff",
294
- label: options?.label,
295
+ label,
295
296
  expected,
296
297
  actual,
297
298
  diff: diff || "(no differences)"
298
299
  };
299
300
  }
300
301
  function getDiffLines(entry) {
301
- const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
302
+ const raw = entry.diff || "(no differences)";
302
303
  return raw.split("\n").map((line) => {
303
304
  const trimmed = line.trimStart();
304
305
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -560,6 +561,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
560
561
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
561
562
  const rerunPassed = [];
562
563
  for (let r = 0; r < reruns; r++) {
564
+ const evaluatorRunId = `run-${randomUUID()}`;
563
565
  const started = Date.now();
564
566
  const evaluatorScores = [];
565
567
  let testCaseError;
@@ -574,6 +576,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
574
576
  const logDiff = (expected, actual, options) => {
575
577
  logs.push(createDiffLogEntry(expected, actual, options));
576
578
  };
579
+ const log = (message, options) => {
580
+ logs.push(createLogEntry(message, options));
581
+ };
577
582
  const ctx = yield* Effect.promise(
578
583
  () => Promise.resolve(evaluator.resolveContext())
579
584
  );
@@ -583,7 +588,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
583
588
  input: testCaseItem.testCase.getInput(),
584
589
  ctx,
585
590
  output,
586
- logDiff
591
+ meta: {
592
+ triggerId: task.triggerId,
593
+ runId: evaluatorRunId,
594
+ datasetId: task.datasetId
595
+ },
596
+ logDiff,
597
+ log
587
598
  })
588
599
  )
589
600
  );
@@ -1041,6 +1052,7 @@ var EffectRunner = class {
1041
1052
  (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1042
1053
  0
1043
1054
  );
1055
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1044
1056
  const runId = `run-${randomUUID()}`;
1045
1057
  const artifactPath = createArtifactPath(
1046
1058
  this.config.artifactDirectory,
@@ -1082,6 +1094,7 @@ var EffectRunner = class {
1082
1094
  await Effect.runPromise(
1083
1095
  Queue.offer(this.runQueue, {
1084
1096
  runId,
1097
+ triggerId,
1085
1098
  datasetId: request.datasetId,
1086
1099
  dataset: dataset.dataset,
1087
1100
  evaluators: selectedEvaluators,
@@ -1756,7 +1769,7 @@ function RunView({
1756
1769
  },
1757
1770
  lineIdx
1758
1771
  )
1759
- ) }, logIdx) : null
1772
+ ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
1760
1773
  ) })
1761
1774
  ]
1762
1775
  },
@@ -2260,6 +2273,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2260
2273
  const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
2261
2274
  lines.push(colored);
2262
2275
  }
2276
+ } else if (log.type === "log") {
2277
+ for (const line of getLogLines(log)) {
2278
+ lines.push(` ${line}`);
2279
+ }
2263
2280
  }
2264
2281
  }
2265
2282
  }