@m4trix/evals 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +44 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +44 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -26
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -26
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +43 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +67 -4
- package/dist/index.js +42 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.cjs
CHANGED
|
@@ -13,7 +13,7 @@ var fs = require('fs');
|
|
|
13
13
|
var jitiModule = require('jiti');
|
|
14
14
|
var promises = require('fs/promises');
|
|
15
15
|
var url = require('url');
|
|
16
|
-
var
|
|
16
|
+
var jsonDiff = require('json-diff');
|
|
17
17
|
|
|
18
18
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
19
19
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -1004,45 +1004,46 @@ async function collectTestCasesFromFiles(config) {
|
|
|
1004
1004
|
);
|
|
1005
1005
|
return found.flat();
|
|
1006
1006
|
}
|
|
1007
|
-
function
|
|
1007
|
+
/**
 * Render a textual diff between two values using json-diff.
 *
 * Caller-supplied options are forwarded, but `color` is always overridden to
 * `false`, so the caller cannot re-enable it.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value compared against the reference.
 * @param {object} [diffOptions] - Extra options passed through to `jsonDiff.diffString`.
 * @returns {string} The diff text, or an empty string when json-diff does not return a string.
 */
function createDiffString(expected, actual, diffOptions) {
  const rendered = jsonDiff.diffString(expected, actual, {
    ...diffOptions,
    color: false
  });
  if (typeof rendered !== "string") {
    return "";
  }
  return rendered;
}
|
|
1012
|
+
/**
 * Convert an arbitrary log payload into a string. Always returns a string and never throws.
 *
 * - Strings are returned unchanged.
 * - Non-null objects are pretty-printed with `JSON.stringify(msg, null, 2)`.
 * - `Error` instances whose JSON form is empty (`"{}"`) or missing are rendered
 *   from their stack, or `name: message` when no stack is available, so the
 *   failure details are not lost.
 * - Everything else, and objects that JSON cannot serialize, goes through `String(msg)`.
 *
 * @param {*} msg - Value to format.
 * @returns {string} Printable representation of `msg`.
 */
function formatLogMessage(msg) {
  if (typeof msg === "string") {
    return msg;
  }
  try {
    if (msg !== null && typeof msg === "object") {
      const json = JSON.stringify(msg, null, 2);
      // A plain Error has no enumerable own properties, so it serializes to "{}".
      if (msg instanceof Error && (json === undefined || json === "{}")) {
        return typeof msg.stack === "string" && msg.stack !== ""
          ? msg.stack
          : `${msg.name}: ${msg.message}`;
      }
      // JSON.stringify returns undefined when e.g. toJSON() yields undefined.
      if (typeof json === "string") {
        return json;
      }
    }
    return String(msg);
  } catch {
    // JSON.stringify throws on cycles and BigInt values; String() can throw too
    // (e.g. null-prototype objects), so keep a last-resort fallback.
    try {
      return String(msg);
    } catch {
      return Object.prototype.toString.call(msg);
    }
  }
}
|
|
1014
|
-
function
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
partLines.pop();
|
|
1021
|
-
}
|
|
1022
|
-
for (const line of partLines) {
|
|
1023
|
-
lines.push(`${prefix} ${line}`);
|
|
1024
|
-
}
|
|
1025
|
-
}
|
|
1026
|
-
return lines.join("\n");
|
|
1024
|
+
/**
 * Build a log entry of type "log" from an arbitrary message.
 *
 * @param {*} message - Payload to record; converted to text by `formatLogMessage`.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "log", label: (string|undefined), message: string }} The log entry.
 */
function createLogEntry(message, options) {
  const label = options?.label;
  const entry = {
    type: "log",
    label,
    message: formatLogMessage(message)
  };
  return entry;
}
|
|
1028
|
-
function
|
|
1029
|
-
|
|
1030
|
-
const actualStr = toJsonLines(actual);
|
|
1031
|
-
const changes = diff.diffLines(expectedStr, actualStr);
|
|
1032
|
-
return formatDiffString(changes);
|
|
1031
|
+
/**
 * Split a log entry's message into individual display lines.
 *
 * Accepts both LF and CRLF line endings so no stray carriage return is left at
 * the end of a line. A missing message yields a single empty line, and any
 * other non-string message is coerced with `String` instead of throwing.
 *
 * @param {{ message: string }} entry - Log entry whose `message` is split.
 * @returns {string[]} The message lines, in order; always at least one element.
 */
function getLogLines(entry) {
  const text = typeof entry.message === "string"
    ? entry.message
    : String(entry.message ?? "");
  return text.split(/\r?\n/);
}
|
|
1034
1034
|
/**
 * Build a log entry of type "diff" describing how `actual` differs from `expected`.
 *
 * `options.label` is stored on the entry; every other option is forwarded to
 * `createDiffString`. When the rendered diff is empty, the entry carries the
 * placeholder text "(no differences)".
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value compared against the reference.
 * @param {object} [options] - Optional `label` plus diff options.
 * @returns {{ type: "diff", label: *, expected: *, actual: *, diff: string }} The diff entry.
 */
function createDiffLogEntry(expected, actual, options) {
  const settings = options ?? {};
  const { label, ...diffOpts } = settings;
  const rendered = createDiffString(expected, actual, diffOpts);
  const entry = {
    type: "diff",
    label,
    expected,
    actual,
    diff: rendered || "(no differences)"
  };
  return entry;
}
|
|
1044
1045
|
function getDiffLines(entry) {
|
|
1045
|
-
const raw =
|
|
1046
|
+
const raw = entry.diff || "(no differences)";
|
|
1046
1047
|
return raw.split("\n").map((line) => {
|
|
1047
1048
|
const trimmed = line.trimStart();
|
|
1048
1049
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -1286,6 +1287,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1286
1287
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1287
1288
|
const rerunPassed = [];
|
|
1288
1289
|
for (let r = 0; r < reruns; r++) {
|
|
1290
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1289
1291
|
const started = Date.now();
|
|
1290
1292
|
const evaluatorScores = [];
|
|
1291
1293
|
let testCaseError;
|
|
@@ -1300,6 +1302,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1300
1302
|
const logDiff = (expected, actual, options) => {
|
|
1301
1303
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1302
1304
|
};
|
|
1305
|
+
const log = (message, options) => {
|
|
1306
|
+
logs.push(createLogEntry(message, options));
|
|
1307
|
+
};
|
|
1303
1308
|
const ctx = yield* effect.Effect.promise(
|
|
1304
1309
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1305
1310
|
);
|
|
@@ -1309,7 +1314,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1309
1314
|
input: testCaseItem.testCase.getInput(),
|
|
1310
1315
|
ctx,
|
|
1311
1316
|
output,
|
|
1312
|
-
|
|
1317
|
+
meta: {
|
|
1318
|
+
triggerId: task.triggerId,
|
|
1319
|
+
runId: evaluatorRunId,
|
|
1320
|
+
datasetId: task.datasetId
|
|
1321
|
+
},
|
|
1322
|
+
logDiff,
|
|
1323
|
+
log
|
|
1313
1324
|
})
|
|
1314
1325
|
)
|
|
1315
1326
|
);
|
|
@@ -1797,6 +1808,7 @@ var EffectRunner = class {
|
|
|
1797
1808
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1798
1809
|
0
|
|
1799
1810
|
);
|
|
1811
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1800
1812
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1801
1813
|
const artifactPath = createArtifactPath(
|
|
1802
1814
|
this.config.artifactDirectory,
|
|
@@ -1838,6 +1850,7 @@ var EffectRunner = class {
|
|
|
1838
1850
|
await effect.Effect.runPromise(
|
|
1839
1851
|
effect.Queue.offer(this.runQueue, {
|
|
1840
1852
|
runId,
|
|
1853
|
+
triggerId,
|
|
1841
1854
|
datasetId: request.datasetId,
|
|
1842
1855
|
dataset: dataset.dataset,
|
|
1843
1856
|
evaluators: selectedEvaluators,
|
|
@@ -2386,6 +2399,23 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2386
2399
|
)
|
|
2387
2400
|
);
|
|
2388
2401
|
}
|
|
2402
|
+
} else if (log.type === "log") {
|
|
2403
|
+
const logLines = getLogLines(log);
|
|
2404
|
+
for (let lineIdx = 0; lineIdx < logLines.length; lineIdx++) {
|
|
2405
|
+
rows.push(
|
|
2406
|
+
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2407
|
+
ink.Text,
|
|
2408
|
+
{
|
|
2409
|
+
color: "gray",
|
|
2410
|
+
children: [
|
|
2411
|
+
" ",
|
|
2412
|
+
logLines[lineIdx]
|
|
2413
|
+
]
|
|
2414
|
+
},
|
|
2415
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-${logIdx}-${lineIdx}`
|
|
2416
|
+
)
|
|
2417
|
+
);
|
|
2418
|
+
}
|
|
2389
2419
|
}
|
|
2390
2420
|
}
|
|
2391
2421
|
}
|