@m4trix/evals 0.14.0 → 0.16.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -13,7 +13,7 @@ var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
14
  var promises = require('fs/promises');
15
15
  var url = require('url');
16
- var diff = require('diff');
16
+ var jsonDiff = require('json-diff');
17
17
 
18
18
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
19
19
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -1004,45 +1004,46 @@ async function collectTestCasesFromFiles(config) {
1004
1004
  );
1005
1005
  return found.flat();
1006
1006
  }
1007
- function toJsonLines(value) {
1007
+ function createDiffString(expected, actual, diffOptions) {
1008
+ const opts = { ...diffOptions, color: false };
1009
+ const result = jsonDiff.diffString(expected, actual, opts);
1010
+ return typeof result === "string" ? result : "";
1011
+ }
1012
+ function formatLogMessage(msg) {
1013
+ if (typeof msg === "string")
1014
+ return msg;
1008
1015
  try {
1009
- return JSON.stringify(value, null, 2);
1016
+ if (msg !== null && typeof msg === "object") {
1017
+ return JSON.stringify(msg, null, 2);
1018
+ }
1019
+ return String(msg);
1010
1020
  } catch {
1011
- return String(value);
1021
+ return String(msg);
1012
1022
  }
1013
1023
  }
1014
- function formatDiffString(changes) {
1015
- const lines = [];
1016
- for (const part of changes) {
1017
- const prefix = part.added ? "+" : part.removed ? "-" : " ";
1018
- const partLines = part.value.split("\n");
1019
- if (partLines[partLines.length - 1] === "") {
1020
- partLines.pop();
1021
- }
1022
- for (const line of partLines) {
1023
- lines.push(`${prefix} ${line}`);
1024
- }
1025
- }
1026
- return lines.join("\n");
1024
+ function createLogEntry(message, options) {
1025
+ return {
1026
+ type: "log",
1027
+ label: options?.label,
1028
+ message: formatLogMessage(message)
1029
+ };
1027
1030
  }
1028
- function createDiffString(expected, actual) {
1029
- const expectedStr = toJsonLines(expected);
1030
- const actualStr = toJsonLines(actual);
1031
- const changes = diff.diffLines(expectedStr, actualStr);
1032
- return formatDiffString(changes);
1031
+ function getLogLines(entry) {
1032
+ return entry.message.split("\n");
1033
1033
  }
1034
1034
  function createDiffLogEntry(expected, actual, options) {
1035
- const diff = createDiffString(expected, actual);
1035
+ const { label, ...diffOpts } = options ?? {};
1036
+ const diff = createDiffString(expected, actual, diffOpts);
1036
1037
  return {
1037
1038
  type: "diff",
1038
- label: options?.label,
1039
+ label,
1039
1040
  expected,
1040
1041
  actual,
1041
1042
  diff: diff || "(no differences)"
1042
1043
  };
1043
1044
  }
1044
1045
  function getDiffLines(entry) {
1045
- const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
1046
+ const raw = entry.diff || "(no differences)";
1046
1047
  return raw.split("\n").map((line) => {
1047
1048
  const trimmed = line.trimStart();
1048
1049
  if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
@@ -1300,6 +1301,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1300
1301
  const logDiff = (expected, actual, options) => {
1301
1302
  logs.push(createDiffLogEntry(expected, actual, options));
1302
1303
  };
1304
+ const log = (message, options) => {
1305
+ logs.push(createLogEntry(message, options));
1306
+ };
1303
1307
  const ctx = yield* effect.Effect.promise(
1304
1308
  () => Promise.resolve(evaluator.resolveContext())
1305
1309
  );
@@ -1309,7 +1313,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1309
1313
  input: testCaseItem.testCase.getInput(),
1310
1314
  ctx,
1311
1315
  output,
1312
- logDiff
1316
+ logDiff,
1317
+ log
1313
1318
  })
1314
1319
  )
1315
1320
  );
@@ -2311,26 +2316,60 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2311
2316
  ":",
2312
2317
  " ",
2313
2318
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2314
- " ",
2315
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
2316
- formatScorePart(s),
2317
- " "
2318
- ] }, s.id)),
2319
- item.metrics?.map((m) => {
2320
- const def = getMetricById(m.id);
2321
- if (!def)
2322
- return null;
2323
- const formatted = def.format(m.data);
2324
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2325
- "[",
2326
- def.name ? `${def.name}: ` : "",
2327
- formatted,
2328
- "]",
2329
- " "
2330
- ] }, m.id);
2331
- })
2319
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2320
+ " ",
2321
+ item.metrics.map((m) => {
2322
+ const def = getMetricById(m.id);
2323
+ if (!def)
2324
+ return null;
2325
+ const formatted = def.format(m.data);
2326
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2327
+ "[",
2328
+ def.name ? `${def.name}: ` : "",
2329
+ formatted,
2330
+ "]",
2331
+ " "
2332
+ ] }, m.id);
2333
+ })
2334
+ ] }) : null
2332
2335
  ] }, `tc-${tc.testCaseId}-${item.evaluatorId}`)
2333
2336
  );
2337
+ if (item.scores.length > 0) {
2338
+ for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2339
+ const s = item.scores[sIdx];
2340
+ const def = getScoreById(s.id);
2341
+ const scoreLabel = def ? def.name ?? def.id : s.id;
2342
+ rows.push(
2343
+ /* @__PURE__ */ jsxRuntime.jsxs(
2344
+ ink.Text,
2345
+ {
2346
+ color: scoreColor(toNumericScore(s.data) ?? 0),
2347
+ children: [
2348
+ " ",
2349
+ scoreLabel,
2350
+ ": ",
2351
+ formatScorePart(s)
2352
+ ]
2353
+ },
2354
+ `tc-${tc.testCaseId}-${item.evaluatorId}-score-${sIdx}`
2355
+ )
2356
+ );
2357
+ }
2358
+ } else {
2359
+ rows.push(
2360
+ /* @__PURE__ */ jsxRuntime.jsxs(
2361
+ ink.Text,
2362
+ {
2363
+ color: "gray",
2364
+ children: [
2365
+ " ",
2366
+ "n/a"
2367
+ ]
2368
+ },
2369
+ `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
2370
+ )
2371
+ );
2372
+ }
2334
2373
  if (!item.passed && item.logs && item.logs.length > 0) {
2335
2374
  for (let logIdx = 0; logIdx < item.logs.length; logIdx++) {
2336
2375
  const log = item.logs[logIdx];
@@ -2352,6 +2391,23 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2352
2391
  )
2353
2392
  );
2354
2393
  }
2394
+ } else if (log.type === "log") {
2395
+ const logLines = getLogLines(log);
2396
+ for (let lineIdx = 0; lineIdx < logLines.length; lineIdx++) {
2397
+ rows.push(
2398
+ /* @__PURE__ */ jsxRuntime.jsxs(
2399
+ ink.Text,
2400
+ {
2401
+ color: "gray",
2402
+ children: [
2403
+ " ",
2404
+ logLines[lineIdx]
2405
+ ]
2406
+ },
2407
+ `tc-${tc.testCaseId}-${item.evaluatorId}-${logIdx}-${lineIdx}`
2408
+ )
2409
+ );
2410
+ }
2355
2411
  }
2356
2412
  }
2357
2413
  }