@m4trix/evals 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +44 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +44 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -26
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -26
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +43 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +67 -4
- package/dist/index.js +42 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.js
CHANGED
|
@@ -11,7 +11,7 @@ import { existsSync } from 'fs';
|
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
12
|
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
13
13
|
import { pathToFileURL } from 'url';
|
|
14
|
-
import {
|
|
14
|
+
import { diffString } from 'json-diff';
|
|
15
15
|
|
|
16
16
|
var SEP = " ";
|
|
17
17
|
var ARROW = "\u203A";
|
|
@@ -978,45 +978,46 @@ async function collectTestCasesFromFiles(config) {
|
|
|
978
978
|
);
|
|
979
979
|
return found.flat();
|
|
980
980
|
}
|
|
981
|
-
function
|
|
981
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
982
|
+
const opts = { ...diffOptions, color: false };
|
|
983
|
+
const result = diffString(expected, actual, opts);
|
|
984
|
+
return typeof result === "string" ? result : "";
|
|
985
|
+
}
|
|
986
|
+
function formatLogMessage(msg) {
|
|
987
|
+
if (typeof msg === "string")
|
|
988
|
+
return msg;
|
|
982
989
|
try {
|
|
983
|
-
|
|
990
|
+
if (msg !== null && typeof msg === "object") {
|
|
991
|
+
return JSON.stringify(msg, null, 2);
|
|
992
|
+
}
|
|
993
|
+
return String(msg);
|
|
984
994
|
} catch {
|
|
985
|
-
return String(
|
|
995
|
+
return String(msg);
|
|
986
996
|
}
|
|
987
997
|
}
|
|
988
|
-
function
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
partLines.pop();
|
|
995
|
-
}
|
|
996
|
-
for (const line of partLines) {
|
|
997
|
-
lines.push(`${prefix} ${line}`);
|
|
998
|
-
}
|
|
999
|
-
}
|
|
1000
|
-
return lines.join("\n");
|
|
998
|
+
function createLogEntry(message, options) {
|
|
999
|
+
return {
|
|
1000
|
+
type: "log",
|
|
1001
|
+
label: options?.label,
|
|
1002
|
+
message: formatLogMessage(message)
|
|
1003
|
+
};
|
|
1001
1004
|
}
|
|
1002
|
-
function
|
|
1003
|
-
|
|
1004
|
-
const actualStr = toJsonLines(actual);
|
|
1005
|
-
const changes = diffLines(expectedStr, actualStr);
|
|
1006
|
-
return formatDiffString(changes);
|
|
1005
|
+
function getLogLines(entry) {
|
|
1006
|
+
return entry.message.split("\n");
|
|
1007
1007
|
}
|
|
1008
1008
|
function createDiffLogEntry(expected, actual, options) {
|
|
1009
|
-
const
|
|
1009
|
+
const { label, ...diffOpts } = options ?? {};
|
|
1010
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
1010
1011
|
return {
|
|
1011
1012
|
type: "diff",
|
|
1012
|
-
label
|
|
1013
|
+
label,
|
|
1013
1014
|
expected,
|
|
1014
1015
|
actual,
|
|
1015
1016
|
diff: diff || "(no differences)"
|
|
1016
1017
|
};
|
|
1017
1018
|
}
|
|
1018
1019
|
function getDiffLines(entry) {
|
|
1019
|
-
const raw =
|
|
1020
|
+
const raw = entry.diff || "(no differences)";
|
|
1020
1021
|
return raw.split("\n").map((line) => {
|
|
1021
1022
|
const trimmed = line.trimStart();
|
|
1022
1023
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -1260,6 +1261,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1260
1261
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1261
1262
|
const rerunPassed = [];
|
|
1262
1263
|
for (let r = 0; r < reruns; r++) {
|
|
1264
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1263
1265
|
const started = Date.now();
|
|
1264
1266
|
const evaluatorScores = [];
|
|
1265
1267
|
let testCaseError;
|
|
@@ -1274,6 +1276,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1274
1276
|
const logDiff = (expected, actual, options) => {
|
|
1275
1277
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1276
1278
|
};
|
|
1279
|
+
const log = (message, options) => {
|
|
1280
|
+
logs.push(createLogEntry(message, options));
|
|
1281
|
+
};
|
|
1277
1282
|
const ctx = yield* Effect.promise(
|
|
1278
1283
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1279
1284
|
);
|
|
@@ -1283,7 +1288,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1283
1288
|
input: testCaseItem.testCase.getInput(),
|
|
1284
1289
|
ctx,
|
|
1285
1290
|
output,
|
|
1286
|
-
|
|
1291
|
+
meta: {
|
|
1292
|
+
triggerId: task.triggerId,
|
|
1293
|
+
runId: evaluatorRunId,
|
|
1294
|
+
datasetId: task.datasetId
|
|
1295
|
+
},
|
|
1296
|
+
logDiff,
|
|
1297
|
+
log
|
|
1287
1298
|
})
|
|
1288
1299
|
)
|
|
1289
1300
|
);
|
|
@@ -1771,6 +1782,7 @@ var EffectRunner = class {
|
|
|
1771
1782
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1772
1783
|
0
|
|
1773
1784
|
);
|
|
1785
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1774
1786
|
const runId = `run-${randomUUID()}`;
|
|
1775
1787
|
const artifactPath = createArtifactPath(
|
|
1776
1788
|
this.config.artifactDirectory,
|
|
@@ -1812,6 +1824,7 @@ var EffectRunner = class {
|
|
|
1812
1824
|
await Effect.runPromise(
|
|
1813
1825
|
Queue.offer(this.runQueue, {
|
|
1814
1826
|
runId,
|
|
1827
|
+
triggerId,
|
|
1815
1828
|
datasetId: request.datasetId,
|
|
1816
1829
|
dataset: dataset.dataset,
|
|
1817
1830
|
evaluators: selectedEvaluators,
|
|
@@ -2360,6 +2373,23 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2360
2373
|
)
|
|
2361
2374
|
);
|
|
2362
2375
|
}
|
|
2376
|
+
} else if (log.type === "log") {
|
|
2377
|
+
const logLines = getLogLines(log);
|
|
2378
|
+
for (let lineIdx = 0; lineIdx < logLines.length; lineIdx++) {
|
|
2379
|
+
rows.push(
|
|
2380
|
+
/* @__PURE__ */ jsxs(
|
|
2381
|
+
Text,
|
|
2382
|
+
{
|
|
2383
|
+
color: "gray",
|
|
2384
|
+
children: [
|
|
2385
|
+
" ",
|
|
2386
|
+
logLines[lineIdx]
|
|
2387
|
+
]
|
|
2388
|
+
},
|
|
2389
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-${logIdx}-${lineIdx}`
|
|
2390
|
+
)
|
|
2391
|
+
);
|
|
2392
|
+
}
|
|
2363
2393
|
}
|
|
2364
2394
|
}
|
|
2365
2395
|
}
|