@m4trix/evals 0.14.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -11,7 +11,7 @@ import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
12
  import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
13
  import { pathToFileURL } from 'url';
14
- import { diffLines } from 'diff';
14
+ import { diffString } from 'json-diff';
15
15
 
16
16
  var SEP = " ";
17
17
  var ARROW = "\u203A";
@@ -978,45 +978,46 @@ async function collectTestCasesFromFiles(config) {
978
978
  );
979
979
  return found.flat();
980
980
  }
981
- function toJsonLines(value) {
981
+ function createDiffString(expected, actual, diffOptions) {
982
+ const opts = { ...diffOptions, color: false };
983
+ const result = diffString(expected, actual, opts);
984
+ return typeof result === "string" ? result : "";
985
+ }
986
+ function formatLogMessage(msg) {
987
+ if (typeof msg === "string")
988
+ return msg;
982
989
  try {
983
- return JSON.stringify(value, null, 2);
990
+ if (msg !== null && typeof msg === "object") {
991
+ return JSON.stringify(msg, null, 2);
992
+ }
993
+ return String(msg);
984
994
  } catch {
985
- return String(value);
995
+ return String(msg);
986
996
  }
987
997
  }
988
- function formatDiffString(changes) {
989
- const lines = [];
990
- for (const part of changes) {
991
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
992
- const partLines = part.value.split("\n");
993
- if (partLines[partLines.length - 1] === "") {
994
- partLines.pop();
995
- }
996
- for (const line of partLines) {
997
- lines.push(`${prefix} ${line}`);
998
- }
999
- }
1000
- return lines.join("\n");
998
+ function createLogEntry(message, options) {
999
+ return {
1000
+ type: "log",
1001
+ label: options?.label,
1002
+ message: formatLogMessage(message)
1003
+ };
1001
1004
  }
1002
- function createDiffString(expected, actual) {
1003
- const expectedStr = toJsonLines(expected);
1004
- const actualStr = toJsonLines(actual);
1005
- const changes = diffLines(expectedStr, actualStr);
1006
- return formatDiffString(changes);
1005
+ function getLogLines(entry) {
1006
+ return entry.message.split("\n");
1007
1007
  }
1008
1008
  function createDiffLogEntry(expected, actual, options) {
1009
- const diff = createDiffString(expected, actual);
1009
+ const { label, ...diffOpts } = options ?? {};
1010
+ const diff = createDiffString(expected, actual, diffOpts);
1010
1011
  return {
1011
1012
  type: "diff",
1012
- label: options?.label,
1013
+ label,
1013
1014
  expected,
1014
1015
  actual,
1015
1016
  diff: diff || "(no differences)"
1016
1017
  };
1017
1018
  }
1018
1019
  function getDiffLines(entry) {
1019
- const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
1020
+ const raw = entry.diff || "(no differences)";
1020
1021
  return raw.split("\n").map((line) => {
1021
1022
  const trimmed = line.trimStart();
1022
1023
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -1274,6 +1275,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1274
1275
  const logDiff = (expected, actual, options) => {
1275
1276
  logs.push(createDiffLogEntry(expected, actual, options));
1276
1277
  };
1278
+ const log = (message, options) => {
1279
+ logs.push(createLogEntry(message, options));
1280
+ };
1277
1281
  const ctx = yield* Effect.promise(
1278
1282
  () => Promise.resolve(evaluator.resolveContext())
1279
1283
  );
@@ -1283,7 +1287,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1283
1287
  input: testCaseItem.testCase.getInput(),
1284
1288
  ctx,
1285
1289
  output,
1286
- logDiff
1290
+ logDiff,
1291
+ log
1287
1292
  })
1288
1293
  )
1289
1294
  );
@@ -2285,26 +2290,60 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2285
2290
  ":",
2286
2291
  " ",
2287
2292
  /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2288
- " ",
2289
- item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
2290
- formatScorePart(s),
2291
- " "
2292
- ] }, s.id)),
2293
- item.metrics?.map((m) => {
2294
- const def = getMetricById(m.id);
2295
- if (!def)
2296
- return null;
2297
- const formatted = def.format(m.data);
2298
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2299
- "[",
2300
- def.name ? `${def.name}: ` : "",
2301
- formatted,
2302
- "]",
2303
- " "
2304
- ] }, m.id);
2305
- })
2293
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
2294
+ " ",
2295
+ item.metrics.map((m) => {
2296
+ const def = getMetricById(m.id);
2297
+ if (!def)
2298
+ return null;
2299
+ const formatted = def.format(m.data);
2300
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2301
+ "[",
2302
+ def.name ? `${def.name}: ` : "",
2303
+ formatted,
2304
+ "]",
2305
+ " "
2306
+ ] }, m.id);
2307
+ })
2308
+ ] }) : null
2306
2309
  ] }, `tc-${tc.testCaseId}-${item.evaluatorId}`)
2307
2310
  );
2311
+ if (item.scores.length > 0) {
2312
+ for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2313
+ const s = item.scores[sIdx];
2314
+ const def = getScoreById(s.id);
2315
+ const scoreLabel = def ? def.name ?? def.id : s.id;
2316
+ rows.push(
2317
+ /* @__PURE__ */ jsxs(
2318
+ Text,
2319
+ {
2320
+ color: scoreColor(toNumericScore(s.data) ?? 0),
2321
+ children: [
2322
+ " ",
2323
+ scoreLabel,
2324
+ ": ",
2325
+ formatScorePart(s)
2326
+ ]
2327
+ },
2328
+ `tc-${tc.testCaseId}-${item.evaluatorId}-score-${sIdx}`
2329
+ )
2330
+ );
2331
+ }
2332
+ } else {
2333
+ rows.push(
2334
+ /* @__PURE__ */ jsxs(
2335
+ Text,
2336
+ {
2337
+ color: "gray",
2338
+ children: [
2339
+ " ",
2340
+ "n/a"
2341
+ ]
2342
+ },
2343
+ `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
2344
+ )
2345
+ );
2346
+ }
2308
2347
  if (!item.passed && item.logs && item.logs.length > 0) {
2309
2348
  for (let logIdx = 0; logIdx < item.logs.length; logIdx++) {
2310
2349
  const log = item.logs[logIdx];
@@ -2326,6 +2365,23 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2326
2365
  )
2327
2366
  );
2328
2367
  }
2368
+ } else if (log.type === "log") {
2369
+ const logLines = getLogLines(log);
2370
+ for (let lineIdx = 0; lineIdx < logLines.length; lineIdx++) {
2371
+ rows.push(
2372
+ /* @__PURE__ */ jsxs(
2373
+ Text,
2374
+ {
2375
+ color: "gray",
2376
+ children: [
2377
+ " ",
2378
+ logLines[lineIdx]
2379
+ ]
2380
+ },
2381
+ `tc-${tc.testCaseId}-${item.evaluatorId}-${logIdx}-${lineIdx}`
2382
+ )
2383
+ );
2384
+ }
2329
2385
  }
2330
2386
  }
2331
2387
  }