@m4trix/evals 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +44 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +44 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -26
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -26
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +43 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +67 -4
- package/dist/index.js +42 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.js
CHANGED
|
@@ -6,7 +6,7 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
9
|
+
import { diffString } from 'json-diff';
|
|
10
10
|
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
11
|
import { render, Box, Text } from 'ink';
|
|
12
12
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
@@ -260,45 +260,46 @@ async function collectTestCasesFromFiles(config) {
|
|
|
260
260
|
);
|
|
261
261
|
return found.flat();
|
|
262
262
|
}
|
|
263
|
-
function
|
|
263
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
264
|
+
const opts = { ...diffOptions, color: false };
|
|
265
|
+
const result = diffString(expected, actual, opts);
|
|
266
|
+
return typeof result === "string" ? result : "";
|
|
267
|
+
}
|
|
268
|
+
function formatLogMessage(msg) {
|
|
269
|
+
if (typeof msg === "string")
|
|
270
|
+
return msg;
|
|
264
271
|
try {
|
|
265
|
-
|
|
272
|
+
if (msg !== null && typeof msg === "object") {
|
|
273
|
+
return JSON.stringify(msg, null, 2);
|
|
274
|
+
}
|
|
275
|
+
return String(msg);
|
|
266
276
|
} catch {
|
|
267
|
-
return String(
|
|
277
|
+
return String(msg);
|
|
268
278
|
}
|
|
269
279
|
}
|
|
270
|
-
function
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
partLines.pop();
|
|
277
|
-
}
|
|
278
|
-
for (const line of partLines) {
|
|
279
|
-
lines.push(`${prefix} ${line}`);
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
return lines.join("\n");
|
|
280
|
+
function createLogEntry(message, options) {
|
|
281
|
+
return {
|
|
282
|
+
type: "log",
|
|
283
|
+
label: options?.label,
|
|
284
|
+
message: formatLogMessage(message)
|
|
285
|
+
};
|
|
283
286
|
}
|
|
284
|
-
function
|
|
285
|
-
|
|
286
|
-
const actualStr = toJsonLines(actual);
|
|
287
|
-
const changes = diffLines(expectedStr, actualStr);
|
|
288
|
-
return formatDiffString(changes);
|
|
287
|
+
function getLogLines(entry) {
|
|
288
|
+
return entry.message.split("\n");
|
|
289
289
|
}
|
|
290
290
|
function createDiffLogEntry(expected, actual, options) {
|
|
291
|
-
const
|
|
291
|
+
const { label, ...diffOpts } = options ?? {};
|
|
292
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
292
293
|
return {
|
|
293
294
|
type: "diff",
|
|
294
|
-
label
|
|
295
|
+
label,
|
|
295
296
|
expected,
|
|
296
297
|
actual,
|
|
297
298
|
diff: diff || "(no differences)"
|
|
298
299
|
};
|
|
299
300
|
}
|
|
300
301
|
function getDiffLines(entry) {
|
|
301
|
-
const raw =
|
|
302
|
+
const raw = entry.diff || "(no differences)";
|
|
302
303
|
return raw.split("\n").map((line) => {
|
|
303
304
|
const trimmed = line.trimStart();
|
|
304
305
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -560,6 +561,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
560
561
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
561
562
|
const rerunPassed = [];
|
|
562
563
|
for (let r = 0; r < reruns; r++) {
|
|
564
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
563
565
|
const started = Date.now();
|
|
564
566
|
const evaluatorScores = [];
|
|
565
567
|
let testCaseError;
|
|
@@ -574,6 +576,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
574
576
|
const logDiff = (expected, actual, options) => {
|
|
575
577
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
576
578
|
};
|
|
579
|
+
const log = (message, options) => {
|
|
580
|
+
logs.push(createLogEntry(message, options));
|
|
581
|
+
};
|
|
577
582
|
const ctx = yield* Effect.promise(
|
|
578
583
|
() => Promise.resolve(evaluator.resolveContext())
|
|
579
584
|
);
|
|
@@ -583,7 +588,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
583
588
|
input: testCaseItem.testCase.getInput(),
|
|
584
589
|
ctx,
|
|
585
590
|
output,
|
|
586
|
-
|
|
591
|
+
meta: {
|
|
592
|
+
triggerId: task.triggerId,
|
|
593
|
+
runId: evaluatorRunId,
|
|
594
|
+
datasetId: task.datasetId
|
|
595
|
+
},
|
|
596
|
+
logDiff,
|
|
597
|
+
log
|
|
587
598
|
})
|
|
588
599
|
)
|
|
589
600
|
);
|
|
@@ -1041,6 +1052,7 @@ var EffectRunner = class {
|
|
|
1041
1052
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1042
1053
|
0
|
|
1043
1054
|
);
|
|
1055
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1044
1056
|
const runId = `run-${randomUUID()}`;
|
|
1045
1057
|
const artifactPath = createArtifactPath(
|
|
1046
1058
|
this.config.artifactDirectory,
|
|
@@ -1082,6 +1094,7 @@ var EffectRunner = class {
|
|
|
1082
1094
|
await Effect.runPromise(
|
|
1083
1095
|
Queue.offer(this.runQueue, {
|
|
1084
1096
|
runId,
|
|
1097
|
+
triggerId,
|
|
1085
1098
|
datasetId: request.datasetId,
|
|
1086
1099
|
dataset: dataset.dataset,
|
|
1087
1100
|
evaluators: selectedEvaluators,
|
|
@@ -1756,7 +1769,7 @@ function RunView({
|
|
|
1756
1769
|
},
|
|
1757
1770
|
lineIdx
|
|
1758
1771
|
)
|
|
1759
|
-
) }, logIdx) : null
|
|
1772
|
+
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
1760
1773
|
) })
|
|
1761
1774
|
]
|
|
1762
1775
|
},
|
|
@@ -2260,6 +2273,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2260
2273
|
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2261
2274
|
lines.push(colored);
|
|
2262
2275
|
}
|
|
2276
|
+
} else if (log.type === "log") {
|
|
2277
|
+
for (const line of getLogLines(log)) {
|
|
2278
|
+
lines.push(` ${line}`);
|
|
2279
|
+
}
|
|
2263
2280
|
}
|
|
2264
2281
|
}
|
|
2265
2282
|
}
|