@m4trix/evals 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +85 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +85 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +24 -3
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +24 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +23 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +25 -13
- package/dist/index.js +23 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import 'json-diff';
|
|
9
|
+
import { diffString } from 'json-diff';
|
|
10
10
|
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
11
|
import { render, Box, Text } from 'ink';
|
|
12
12
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
@@ -256,6 +256,29 @@ async function collectTestCasesFromFiles(config) {
|
|
|
256
256
|
);
|
|
257
257
|
return found.flat();
|
|
258
258
|
}
|
|
259
|
+
/**
 * Build a log entry of type "diff" for an expected/actual pair.
 *
 * The textual diff is produced by `diffString` (imported from 'json-diff')
 * with `color: false`; when that call yields a falsy value the placeholder
 * "(no differences)" is stored instead.
 *
 * @param {*} expected - Reference value; stored on the entry unchanged.
 * @param {*} actual - Observed value; stored on the entry unchanged.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "diff", label: (string|undefined), expected: *, actual: *, diff: string }}
 */
function createDiffLogEntry(expected, actual, options) {
  const rendered = diffString(expected, actual, { color: false });
  // Fall back to a fixed placeholder when the rendered diff is empty/falsy.
  const diffText = rendered ? rendered : "(no differences)";
  const entry = {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: diffText
  };
  return entry;
}
|
|
269
|
+
/**
 * Split a diff log entry into individual lines tagged for colouring.
 *
 * The text already stored on `entry.diff` is used when it is a non-empty
 * string, so the rendered lines match what was captured at log time even if
 * `expected`/`actual` were mutated afterwards, and the diff is not computed
 * twice. Only when no stored text is available is the diff recomputed with
 * `diffString` (falling back to "(no differences)" when that is empty).
 *
 * Classification looks at the first character of the untrimmed line:
 *   - starts with "-" (but not "---")  -> "remove"
 *   - starts with "+" (but not "+++")  -> "add"
 *   - anything else                    -> "context"
 * The line is deliberately NOT trimmed first: an indented context line whose
 * content begins with "-" or "+" (e.g. a negative number) must stay "context".
 * NOTE(review): this relies on json-diff emitting its marker in column 0 —
 * confirm against the json-diff version in use.
 *
 * @param {{ expected?: *, actual?: *, diff?: string }} entry - Diff log entry.
 * @returns {Array<{ type: "remove" | "add" | "context", line: string }>}
 *   One item per "\n"-separated line, with `line` left unmodified.
 */
function getDiffLines(entry) {
  const hasStoredDiff = typeof entry.diff === "string" && entry.diff.length > 0;
  const raw = hasStoredDiff
    ? entry.diff
    : diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
  return raw.split("\n").map((line) => {
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
|
|
259
282
|
|
|
260
283
|
// src/evals/metric.ts
|
|
261
284
|
var registry = /* @__PURE__ */ new Map();
|
|
@@ -439,6 +462,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
439
462
|
continue;
|
|
440
463
|
}
|
|
441
464
|
try {
|
|
465
|
+
const logs = [];
|
|
466
|
+
const logDiff = (expected, actual, options) => {
|
|
467
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
468
|
+
};
|
|
442
469
|
const ctx = yield* Effect.promise(
|
|
443
470
|
() => Promise.resolve(evaluator.resolveContext())
|
|
444
471
|
);
|
|
@@ -447,13 +474,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
447
474
|
evaluateFn({
|
|
448
475
|
input: testCaseItem.testCase.getInput(),
|
|
449
476
|
ctx,
|
|
450
|
-
output
|
|
477
|
+
output,
|
|
478
|
+
logDiff
|
|
451
479
|
})
|
|
452
480
|
)
|
|
453
481
|
);
|
|
454
482
|
const { scores, metrics } = normalizeResult(result);
|
|
455
483
|
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
456
|
-
evaluatorScores.push({
|
|
484
|
+
evaluatorScores.push({
|
|
485
|
+
evaluatorId,
|
|
486
|
+
scores,
|
|
487
|
+
passed,
|
|
488
|
+
metrics,
|
|
489
|
+
logs: logs.length > 0 ? logs : void 0
|
|
490
|
+
});
|
|
457
491
|
} catch (error) {
|
|
458
492
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
459
493
|
evaluatorScores.push({
|
|
@@ -1176,7 +1210,8 @@ function RunView({
|
|
|
1176
1210
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
1177
1211
|
scores: item.scores,
|
|
1178
1212
|
passed: item.passed,
|
|
1179
|
-
metrics: item.metrics
|
|
1213
|
+
metrics: item.metrics,
|
|
1214
|
+
logs: item.logs
|
|
1180
1215
|
}))
|
|
1181
1216
|
}
|
|
1182
1217
|
]);
|
|
@@ -1263,30 +1298,42 @@ function RunView({
|
|
|
1263
1298
|
"ms)"
|
|
1264
1299
|
] })
|
|
1265
1300
|
] }),
|
|
1266
|
-
tc.evaluatorScores.map((item) => /* @__PURE__ */
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
] }, s.id)),
|
|
1276
|
-
item.metrics?.map((m) => {
|
|
1277
|
-
const def = getMetricById(m.id);
|
|
1278
|
-
if (!def)
|
|
1279
|
-
return null;
|
|
1280
|
-
const formatted = def.format(m.data);
|
|
1281
|
-
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1282
|
-
"[",
|
|
1283
|
-
def.name ? `${def.name}: ` : "",
|
|
1284
|
-
formatted,
|
|
1285
|
-
"]",
|
|
1301
|
+
tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
|
|
1302
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1303
|
+
item.evaluatorName,
|
|
1304
|
+
":",
|
|
1305
|
+
" ",
|
|
1306
|
+
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1307
|
+
" ",
|
|
1308
|
+
item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
|
|
1309
|
+
formatScorePart(s),
|
|
1286
1310
|
" "
|
|
1287
|
-
] },
|
|
1288
|
-
|
|
1289
|
-
|
|
1311
|
+
] }, s.id)),
|
|
1312
|
+
item.metrics?.map((m) => {
|
|
1313
|
+
const def = getMetricById(m.id);
|
|
1314
|
+
if (!def)
|
|
1315
|
+
return null;
|
|
1316
|
+
const formatted = def.format(m.data);
|
|
1317
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1318
|
+
"[",
|
|
1319
|
+
def.name ? `${def.name}: ` : "",
|
|
1320
|
+
formatted,
|
|
1321
|
+
"]",
|
|
1322
|
+
" "
|
|
1323
|
+
] }, m.id);
|
|
1324
|
+
})
|
|
1325
|
+
] }),
|
|
1326
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1327
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
1328
|
+
Text,
|
|
1329
|
+
{
|
|
1330
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1331
|
+
children: line
|
|
1332
|
+
},
|
|
1333
|
+
lineIdx
|
|
1334
|
+
)) }, logIdx) : null
|
|
1335
|
+
) })
|
|
1336
|
+
] }, item.evaluatorId))
|
|
1290
1337
|
] }, i)) }),
|
|
1291
1338
|
phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
1292
1339
|
/* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
@@ -1535,6 +1582,17 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1535
1582
|
item.metrics
|
|
1536
1583
|
)
|
|
1537
1584
|
);
|
|
1585
|
+
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1586
|
+
for (const log of item.logs) {
|
|
1587
|
+
if (log.type === "diff") {
|
|
1588
|
+
const useColor = process.stdout.isTTY;
|
|
1589
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
1590
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1591
|
+
console.log(colored);
|
|
1592
|
+
}
|
|
1593
|
+
}
|
|
1594
|
+
}
|
|
1595
|
+
}
|
|
1538
1596
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1539
1597
|
if (numeric !== void 0) {
|
|
1540
1598
|
const current = aggregates.get(item.evaluatorId) ?? {
|