@m4trix/evals 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -111
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +200 -111
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -44
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -44
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +35 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +49 -4
- package/dist/index.js +34 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.js
CHANGED
|
@@ -11,7 +11,7 @@ import { existsSync } from 'fs';
|
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
12
|
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
13
13
|
import { pathToFileURL } from 'url';
|
|
14
|
-
import {
|
|
14
|
+
import { diffString } from 'json-diff';
|
|
15
15
|
|
|
16
16
|
var SEP = " ";
|
|
17
17
|
var ARROW = "\u203A";
|
|
@@ -978,45 +978,46 @@ async function collectTestCasesFromFiles(config) {
|
|
|
978
978
|
);
|
|
979
979
|
return found.flat();
|
|
980
980
|
}
|
|
981
|
-
function
|
|
981
|
+
/**
 * Renders the difference between two values as plain (uncoloured) text.
 *
 * Caller-supplied options are forwarded to `diffString`, except that
 * `color` is always forced to `false` (it is spread last, so it wins over
 * any `color` value in `diffOptions`).
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value to compare against the reference.
 * @param {object} [diffOptions] - Extra options forwarded to `diffString`.
 * @returns {string} The rendered diff, or "" when `diffString` does not
 *   return a string.
 */
function createDiffString(expected, actual, diffOptions) {
  const rendered = diffString(expected, actual, { ...diffOptions, color: false });
  if (typeof rendered !== "string") {
    return "";
  }
  return rendered;
}
|
|
986
|
+
/**
 * Converts an arbitrary log payload into a printable string.
 *
 * - Strings are returned unchanged.
 * - Error instances are rendered via their stack (or "name: message" when no
 *   stack is available) instead of the "{}" that JSON.stringify produces.
 * - Other objects are pretty-printed as JSON with a 2-space indent.
 * - Everything else goes through String().
 *
 * The function always returns a string and never throws.
 *
 * @param {unknown} msg - Value passed to the evaluator's `log` helper.
 * @returns {string} Printable representation of `msg`.
 */
function formatLogMessage(msg) {
  if (typeof msg === "string") {
    return msg;
  }
  // Error fields (message, stack) are non-enumerable, so JSON.stringify would
  // collapse an Error to "{}" and drop the only useful information.
  if (msg instanceof Error) {
    return typeof msg.stack === "string" && msg.stack.length > 0
      ? msg.stack
      : `${msg.name}: ${msg.message}`;
  }
  try {
    if (msg !== null && typeof msg === "object") {
      const json = JSON.stringify(msg, null, 2);
      // JSON.stringify returns undefined when e.g. toJSON() yields undefined;
      // fall through to String() so callers always receive a string.
      if (typeof json === "string") {
        return json;
      }
    }
    return String(msg);
  } catch {
    // Reached for circular structures, BigInt members, throwing toJSON, etc.
    try {
      return String(msg);
    } catch {
      // String() itself throws for objects without a usable toString
      // (e.g. null-prototype objects).
      return Object.prototype.toString.call(msg);
    }
  }
}
|
|
988
|
-
function
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
partLines.pop();
|
|
995
|
-
}
|
|
996
|
-
for (const line of partLines) {
|
|
997
|
-
lines.push(`${prefix} ${line}`);
|
|
998
|
-
}
|
|
999
|
-
}
|
|
1000
|
-
return lines.join("\n");
|
|
998
|
+
/**
 * Builds a log record of type "log" from an arbitrary message.
 *
 * @param {unknown} message - Payload to log; converted with formatLogMessage.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "log", label: (string|undefined), message: * }} The log entry.
 */
function createLogEntry(message, options) {
  const label = options?.label;
  const text = formatLogMessage(message);
  return { type: "log", label, message: text };
}
|
|
1002
|
-
function
|
|
1003
|
-
|
|
1004
|
-
const actualStr = toJsonLines(actual);
|
|
1005
|
-
const changes = diffLines(expectedStr, actualStr);
|
|
1006
|
-
return formatDiffString(changes);
|
|
1005
|
+
/**
 * Splits a log entry's message into individual display lines.
 *
 * Handles "\n", "\r\n" and lone "\r" line endings so that no carriage
 * return is left at the end of a rendered line. A missing or non-string
 * message is coerced to a string instead of throwing.
 *
 * @param {{ message?: unknown }} entry - Log entry whose message is split.
 * @returns {string[]} One element per line; [""] for an empty message.
 */
function getLogLines(entry) {
  const { message } = entry;
  const text = typeof message === "string" ? message : String(message ?? "");
  return text.split(/\r\n?|\n/);
}
|
|
1008
1008
|
/**
 * Builds a log record of type "diff" describing how `actual` deviates from
 * `expected`.
 *
 * `label` is taken out of `options`; every remaining option is passed on to
 * createDiffString. When the rendered diff is empty, the placeholder
 * "(no differences)" is stored instead.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Observed value.
 * @param {object} [options] - Optional `label` plus diff options.
 * @returns {{ type: "diff", label: *, expected: *, actual: *, diff: string }}
 */
function createDiffLogEntry(expected, actual, options) {
  const settings = options ?? {};
  const { label, ...diffSettings } = settings;
  const rendered = createDiffString(expected, actual, diffSettings);
  const diff = rendered ? rendered : "(no differences)";
  return { type: "diff", label, expected, actual, diff };
}
|
|
1018
1019
|
function getDiffLines(entry) {
|
|
1019
|
-
const raw =
|
|
1020
|
+
const raw = entry.diff || "(no differences)";
|
|
1020
1021
|
return raw.split("\n").map((line) => {
|
|
1021
1022
|
const trimmed = line.trimStart();
|
|
1022
1023
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -1274,6 +1275,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1274
1275
|
const logDiff = (expected, actual, options) => {
|
|
1275
1276
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1276
1277
|
};
|
|
1278
|
+
const log = (message, options) => {
|
|
1279
|
+
logs.push(createLogEntry(message, options));
|
|
1280
|
+
};
|
|
1277
1281
|
const ctx = yield* Effect.promise(
|
|
1278
1282
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1279
1283
|
);
|
|
@@ -1283,7 +1287,8 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1283
1287
|
input: testCaseItem.testCase.getInput(),
|
|
1284
1288
|
ctx,
|
|
1285
1289
|
output,
|
|
1286
|
-
logDiff
|
|
1290
|
+
logDiff,
|
|
1291
|
+
log
|
|
1287
1292
|
})
|
|
1288
1293
|
)
|
|
1289
1294
|
);
|
|
@@ -2285,26 +2290,60 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2285
2290
|
":",
|
|
2286
2291
|
" ",
|
|
2287
2292
|
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
|
|
2300
|
-
|
|
2301
|
-
|
|
2302
|
-
|
|
2303
|
-
|
|
2304
|
-
] }, m.id);
|
|
2305
|
-
})
|
|
2293
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2294
|
+
" ",
|
|
2295
|
+
item.metrics.map((m) => {
|
|
2296
|
+
const def = getMetricById(m.id);
|
|
2297
|
+
if (!def)
|
|
2298
|
+
return null;
|
|
2299
|
+
const formatted = def.format(m.data);
|
|
2300
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2301
|
+
"[",
|
|
2302
|
+
def.name ? `${def.name}: ` : "",
|
|
2303
|
+
formatted,
|
|
2304
|
+
"]",
|
|
2305
|
+
" "
|
|
2306
|
+
] }, m.id);
|
|
2307
|
+
})
|
|
2308
|
+
] }) : null
|
|
2306
2309
|
] }, `tc-${tc.testCaseId}-${item.evaluatorId}`)
|
|
2307
2310
|
);
|
|
2311
|
+
if (item.scores.length > 0) {
|
|
2312
|
+
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2313
|
+
const s = item.scores[sIdx];
|
|
2314
|
+
const def = getScoreById(s.id);
|
|
2315
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
2316
|
+
rows.push(
|
|
2317
|
+
/* @__PURE__ */ jsxs(
|
|
2318
|
+
Text,
|
|
2319
|
+
{
|
|
2320
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
2321
|
+
children: [
|
|
2322
|
+
" ",
|
|
2323
|
+
scoreLabel,
|
|
2324
|
+
": ",
|
|
2325
|
+
formatScorePart(s)
|
|
2326
|
+
]
|
|
2327
|
+
},
|
|
2328
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-score-${sIdx}`
|
|
2329
|
+
)
|
|
2330
|
+
);
|
|
2331
|
+
}
|
|
2332
|
+
} else {
|
|
2333
|
+
rows.push(
|
|
2334
|
+
/* @__PURE__ */ jsxs(
|
|
2335
|
+
Text,
|
|
2336
|
+
{
|
|
2337
|
+
color: "gray",
|
|
2338
|
+
children: [
|
|
2339
|
+
" ",
|
|
2340
|
+
"n/a"
|
|
2341
|
+
]
|
|
2342
|
+
},
|
|
2343
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
|
|
2344
|
+
)
|
|
2345
|
+
);
|
|
2346
|
+
}
|
|
2308
2347
|
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
2309
2348
|
for (let logIdx = 0; logIdx < item.logs.length; logIdx++) {
|
|
2310
2349
|
const log = item.logs[logIdx];
|
|
@@ -2326,6 +2365,23 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2326
2365
|
)
|
|
2327
2366
|
);
|
|
2328
2367
|
}
|
|
2368
|
+
} else if (log.type === "log") {
|
|
2369
|
+
const logLines = getLogLines(log);
|
|
2370
|
+
for (let lineIdx = 0; lineIdx < logLines.length; lineIdx++) {
|
|
2371
|
+
rows.push(
|
|
2372
|
+
/* @__PURE__ */ jsxs(
|
|
2373
|
+
Text,
|
|
2374
|
+
{
|
|
2375
|
+
color: "gray",
|
|
2376
|
+
children: [
|
|
2377
|
+
" ",
|
|
2378
|
+
logLines[lineIdx]
|
|
2379
|
+
]
|
|
2380
|
+
},
|
|
2381
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-${logIdx}-${lineIdx}`
|
|
2382
|
+
)
|
|
2383
|
+
);
|
|
2384
|
+
}
|
|
2329
2385
|
}
|
|
2330
2386
|
}
|
|
2331
2387
|
}
|