@m4trix/evals 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +44 -27
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +44 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -26
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -26
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +43 -27
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +67 -4
- package/dist/index.js +42 -28
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,7 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var jsonDiff = require('json-diff');
|
|
12
12
|
var React2 = require('react');
|
|
13
13
|
var ink = require('ink');
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -286,45 +286,46 @@ async function collectTestCasesFromFiles(config) {
|
|
|
286
286
|
);
|
|
287
287
|
return found.flat();
|
|
288
288
|
}
|
|
289
|
-
function
|
|
289
|
+
function createDiffString(expected, actual, diffOptions) {
|
|
290
|
+
const opts = { ...diffOptions, color: false };
|
|
291
|
+
const result = jsonDiff.diffString(expected, actual, opts);
|
|
292
|
+
return typeof result === "string" ? result : "";
|
|
293
|
+
}
|
|
294
|
+
function formatLogMessage(msg) {
|
|
295
|
+
if (typeof msg === "string")
|
|
296
|
+
return msg;
|
|
290
297
|
try {
|
|
291
|
-
|
|
298
|
+
if (msg !== null && typeof msg === "object") {
|
|
299
|
+
return JSON.stringify(msg, null, 2);
|
|
300
|
+
}
|
|
301
|
+
return String(msg);
|
|
292
302
|
} catch {
|
|
293
|
-
return String(
|
|
303
|
+
return String(msg);
|
|
294
304
|
}
|
|
295
305
|
}
|
|
296
|
-
function
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
partLines.pop();
|
|
303
|
-
}
|
|
304
|
-
for (const line of partLines) {
|
|
305
|
-
lines.push(`${prefix} ${line}`);
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
return lines.join("\n");
|
|
306
|
+
function createLogEntry(message, options) {
|
|
307
|
+
return {
|
|
308
|
+
type: "log",
|
|
309
|
+
label: options?.label,
|
|
310
|
+
message: formatLogMessage(message)
|
|
311
|
+
};
|
|
309
312
|
}
|
|
310
|
-
function
|
|
311
|
-
|
|
312
|
-
const actualStr = toJsonLines(actual);
|
|
313
|
-
const changes = diff.diffLines(expectedStr, actualStr);
|
|
314
|
-
return formatDiffString(changes);
|
|
313
|
+
function getLogLines(entry) {
|
|
314
|
+
return entry.message.split("\n");
|
|
315
315
|
}
|
|
316
316
|
function createDiffLogEntry(expected, actual, options) {
|
|
317
|
-
const
|
|
317
|
+
const { label, ...diffOpts } = options ?? {};
|
|
318
|
+
const diff = createDiffString(expected, actual, diffOpts);
|
|
318
319
|
return {
|
|
319
320
|
type: "diff",
|
|
320
|
-
label
|
|
321
|
+
label,
|
|
321
322
|
expected,
|
|
322
323
|
actual,
|
|
323
324
|
diff: diff || "(no differences)"
|
|
324
325
|
};
|
|
325
326
|
}
|
|
326
327
|
function getDiffLines(entry) {
|
|
327
|
-
const raw =
|
|
328
|
+
const raw = entry.diff || "(no differences)";
|
|
328
329
|
return raw.split("\n").map((line) => {
|
|
329
330
|
const trimmed = line.trimStart();
|
|
330
331
|
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
@@ -586,6 +587,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
586
587
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
587
588
|
const rerunPassed = [];
|
|
588
589
|
for (let r = 0; r < reruns; r++) {
|
|
590
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
589
591
|
const started = Date.now();
|
|
590
592
|
const evaluatorScores = [];
|
|
591
593
|
let testCaseError;
|
|
@@ -600,6 +602,9 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
600
602
|
const logDiff = (expected, actual, options) => {
|
|
601
603
|
logs.push(createDiffLogEntry(expected, actual, options));
|
|
602
604
|
};
|
|
605
|
+
const log = (message, options) => {
|
|
606
|
+
logs.push(createLogEntry(message, options));
|
|
607
|
+
};
|
|
603
608
|
const ctx = yield* effect.Effect.promise(
|
|
604
609
|
() => Promise.resolve(evaluator.resolveContext())
|
|
605
610
|
);
|
|
@@ -609,7 +614,13 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
609
614
|
input: testCaseItem.testCase.getInput(),
|
|
610
615
|
ctx,
|
|
611
616
|
output,
|
|
612
|
-
|
|
617
|
+
meta: {
|
|
618
|
+
triggerId: task.triggerId,
|
|
619
|
+
runId: evaluatorRunId,
|
|
620
|
+
datasetId: task.datasetId
|
|
621
|
+
},
|
|
622
|
+
logDiff,
|
|
623
|
+
log
|
|
613
624
|
})
|
|
614
625
|
)
|
|
615
626
|
);
|
|
@@ -1067,6 +1078,7 @@ var EffectRunner = class {
|
|
|
1067
1078
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1068
1079
|
0
|
|
1069
1080
|
);
|
|
1081
|
+
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1070
1082
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1071
1083
|
const artifactPath = createArtifactPath(
|
|
1072
1084
|
this.config.artifactDirectory,
|
|
@@ -1108,6 +1120,7 @@ var EffectRunner = class {
|
|
|
1108
1120
|
await effect.Effect.runPromise(
|
|
1109
1121
|
effect.Queue.offer(this.runQueue, {
|
|
1110
1122
|
runId,
|
|
1123
|
+
triggerId,
|
|
1111
1124
|
datasetId: request.datasetId,
|
|
1112
1125
|
dataset: dataset.dataset,
|
|
1113
1126
|
evaluators: selectedEvaluators,
|
|
@@ -1782,7 +1795,7 @@ function RunView({
|
|
|
1782
1795
|
},
|
|
1783
1796
|
lineIdx
|
|
1784
1797
|
)
|
|
1785
|
-
) }, logIdx) : null
|
|
1798
|
+
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
1786
1799
|
) })
|
|
1787
1800
|
]
|
|
1788
1801
|
},
|
|
@@ -2286,6 +2299,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2286
2299
|
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
2287
2300
|
lines.push(colored);
|
|
2288
2301
|
}
|
|
2302
|
+
} else if (log.type === "log") {
|
|
2303
|
+
for (const line of getLogLines(log)) {
|
|
2304
|
+
lines.push(` ${line}`);
|
|
2305
|
+
}
|
|
2289
2306
|
}
|
|
2290
2307
|
}
|
|
2291
2308
|
}
|