@m4trix/evals 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +85 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +85 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +24 -3
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +24 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +23 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +25 -13
- package/dist/index.js +23 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,7 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
require('json-diff');
|
|
11
|
+
var jsonDiff = require('json-diff');
|
|
12
12
|
var React2 = require('react');
|
|
13
13
|
var ink = require('ink');
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -282,6 +282,29 @@ async function collectTestCasesFromFiles(config) {
|
|
|
282
282
|
);
|
|
283
283
|
return found.flat();
|
|
284
284
|
}
|
|
285
|
+
/**
 * Builds a log entry of type "diff" describing how `actual` deviates from
 * `expected`.
 *
 * @param {*} expected - The reference value.
 * @param {*} actual - The value that was produced.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "diff", label: (string|undefined), expected: *, actual: *, diff: string }}
 *   The entry, carrying both values plus a plain-text (uncoloured) diff.
 */
function createDiffLogEntry(expected, actual, options) {
  // Colour is disabled so the stored text is free of ANSI escapes.
  const rendered = jsonDiff.diffString(expected, actual, { color: false });
  // A falsy result is replaced by a readable placeholder.
  const text = rendered ? rendered : "(no differences)";
  const entry = {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: text
  };
  return entry;
}
|
|
295
|
+
/**
 * Splits a diff log entry into display lines, each tagged so the caller can
 * colour it.
 *
 * The diff text stored on the entry (`entry.diff`) is reused when present;
 * otherwise it is recomputed from `entry.expected` and `entry.actual` with
 * colour disabled, falling back to "(no differences)" when that yields a
 * falsy result.
 *
 * @param {{ expected: *, actual: *, diff?: string }} entry - A "diff" log entry.
 * @returns {Array<{ type: "remove" | "add" | "context", line: string }>}
 *   One item per "\n"-separated line, with the original line text untouched.
 */
function getDiffLines(entry) {
  const stored = typeof entry.diff === "string" && entry.diff.length > 0 ? entry.diff : void 0;
  const raw = stored ?? (jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)");
  return raw.split("\n").map((line) => {
    // The change marker is read from the first column of the untrimmed line.
    // Indented content that merely begins with "-" or "+" (for example a
    // negative number inside an unchanged array) is context, not a change.
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
|
|
285
308
|
|
|
286
309
|
// src/evals/metric.ts
|
|
287
310
|
var registry = /* @__PURE__ */ new Map();
|
|
@@ -465,6 +488,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
465
488
|
continue;
|
|
466
489
|
}
|
|
467
490
|
try {
|
|
491
|
+
const logs = [];
|
|
492
|
+
const logDiff = (expected, actual, options) => {
|
|
493
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
494
|
+
};
|
|
468
495
|
const ctx = yield* effect.Effect.promise(
|
|
469
496
|
() => Promise.resolve(evaluator.resolveContext())
|
|
470
497
|
);
|
|
@@ -473,13 +500,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
473
500
|
evaluateFn({
|
|
474
501
|
input: testCaseItem.testCase.getInput(),
|
|
475
502
|
ctx,
|
|
476
|
-
output
|
|
503
|
+
output,
|
|
504
|
+
logDiff
|
|
477
505
|
})
|
|
478
506
|
)
|
|
479
507
|
);
|
|
480
508
|
const { scores, metrics } = normalizeResult(result);
|
|
481
509
|
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
482
|
-
evaluatorScores.push({
|
|
510
|
+
evaluatorScores.push({
|
|
511
|
+
evaluatorId,
|
|
512
|
+
scores,
|
|
513
|
+
passed,
|
|
514
|
+
metrics,
|
|
515
|
+
logs: logs.length > 0 ? logs : void 0
|
|
516
|
+
});
|
|
483
517
|
} catch (error) {
|
|
484
518
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
485
519
|
evaluatorScores.push({
|
|
@@ -1202,7 +1236,8 @@ function RunView({
|
|
|
1202
1236
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
1203
1237
|
scores: item.scores,
|
|
1204
1238
|
passed: item.passed,
|
|
1205
|
-
metrics: item.metrics
|
|
1239
|
+
metrics: item.metrics,
|
|
1240
|
+
logs: item.logs
|
|
1206
1241
|
}))
|
|
1207
1242
|
}
|
|
1208
1243
|
]);
|
|
@@ -1289,30 +1324,42 @@ function RunView({
|
|
|
1289
1324
|
"ms)"
|
|
1290
1325
|
] })
|
|
1291
1326
|
] }),
|
|
1292
|
-
tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
] }, s.id)),
|
|
1302
|
-
item.metrics?.map((m) => {
|
|
1303
|
-
const def = getMetricById(m.id);
|
|
1304
|
-
if (!def)
|
|
1305
|
-
return null;
|
|
1306
|
-
const formatted = def.format(m.data);
|
|
1307
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1308
|
-
"[",
|
|
1309
|
-
def.name ? `${def.name}: ` : "",
|
|
1310
|
-
formatted,
|
|
1311
|
-
"]",
|
|
1327
|
+
tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
|
|
1328
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1329
|
+
item.evaluatorName,
|
|
1330
|
+
":",
|
|
1331
|
+
" ",
|
|
1332
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1333
|
+
" ",
|
|
1334
|
+
item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
|
|
1335
|
+
formatScorePart(s),
|
|
1312
1336
|
" "
|
|
1313
|
-
] },
|
|
1314
|
-
|
|
1315
|
-
|
|
1337
|
+
] }, s.id)),
|
|
1338
|
+
item.metrics?.map((m) => {
|
|
1339
|
+
const def = getMetricById(m.id);
|
|
1340
|
+
if (!def)
|
|
1341
|
+
return null;
|
|
1342
|
+
const formatted = def.format(m.data);
|
|
1343
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1344
|
+
"[",
|
|
1345
|
+
def.name ? `${def.name}: ` : "",
|
|
1346
|
+
formatted,
|
|
1347
|
+
"]",
|
|
1348
|
+
" "
|
|
1349
|
+
] }, m.id);
|
|
1350
|
+
})
|
|
1351
|
+
] }),
|
|
1352
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1353
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
1354
|
+
ink.Text,
|
|
1355
|
+
{
|
|
1356
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1357
|
+
children: line
|
|
1358
|
+
},
|
|
1359
|
+
lineIdx
|
|
1360
|
+
)) }, logIdx) : null
|
|
1361
|
+
) })
|
|
1362
|
+
] }, item.evaluatorId))
|
|
1316
1363
|
] }, i)) }),
|
|
1317
1364
|
phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1318
1365
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
@@ -1561,6 +1608,17 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1561
1608
|
item.metrics
|
|
1562
1609
|
)
|
|
1563
1610
|
);
|
|
1611
|
+
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1612
|
+
for (const log of item.logs) {
|
|
1613
|
+
if (log.type === "diff") {
|
|
1614
|
+
const useColor = process.stdout.isTTY;
|
|
1615
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
1616
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1617
|
+
console.log(colored);
|
|
1618
|
+
}
|
|
1619
|
+
}
|
|
1620
|
+
}
|
|
1621
|
+
}
|
|
1564
1622
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1565
1623
|
if (numeric !== void 0) {
|
|
1566
1624
|
const current = aggregates.get(item.evaluatorId) ?? {
|