@m4trix/evals 0.26.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -3
- package/dist/cli-simple.cjs +17 -8
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +17 -8
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +14 -5
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +14 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +93 -69
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +30 -10
- package/dist/index.js +91 -70
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -29,6 +29,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
29
29
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
30
30
|
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
31
31
|
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
32
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
32
33
|
function validateWithSchema(schema, raw, context) {
|
|
33
34
|
const trimmed = raw.trim();
|
|
34
35
|
const decode = Schema.decodeUnknownEither(
|
|
@@ -585,6 +586,14 @@ function getTestCaseTagList(testCase) {
|
|
|
585
586
|
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
586
587
|
}
|
|
587
588
|
|
|
589
|
+
// src/evals/dataset.ts
|
|
590
|
+
function getDatasetDisplayLabel(dataset) {
|
|
591
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
592
|
+
return dataset.getDisplayLabel();
|
|
593
|
+
}
|
|
594
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
595
|
+
}
|
|
596
|
+
|
|
588
597
|
// src/evals/metric.ts
|
|
589
598
|
var registry = /* @__PURE__ */ new Map();
|
|
590
599
|
var Metric = {
|
|
@@ -978,7 +987,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
978
987
|
meta: {
|
|
979
988
|
triggerId: task.triggerId,
|
|
980
989
|
runId: evaluatorRunId,
|
|
981
|
-
|
|
990
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
982
991
|
repetitionId,
|
|
983
992
|
repetitionIndex,
|
|
984
993
|
repetitionCount,
|
|
@@ -1393,7 +1402,7 @@ var EffectRunner = class {
|
|
|
1393
1402
|
);
|
|
1394
1403
|
if (!dsCollected) {
|
|
1395
1404
|
throw new Error(
|
|
1396
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
1405
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1397
1406
|
);
|
|
1398
1407
|
}
|
|
1399
1408
|
let evaluatorIds;
|
|
@@ -1528,7 +1537,7 @@ var EffectRunner = class {
|
|
|
1528
1537
|
const snapshot = {
|
|
1529
1538
|
runId,
|
|
1530
1539
|
datasetId: params.datasetId,
|
|
1531
|
-
datasetName: dataset.dataset.
|
|
1540
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1532
1541
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1533
1542
|
queuedAt: Date.now(),
|
|
1534
1543
|
totalTestCases: totalEvaluations,
|
|
@@ -1549,7 +1558,7 @@ var EffectRunner = class {
|
|
|
1549
1558
|
type: "RunQueued",
|
|
1550
1559
|
runId,
|
|
1551
1560
|
datasetId: params.datasetId,
|
|
1552
|
-
datasetName: dataset.dataset.
|
|
1561
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1553
1562
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1554
1563
|
totalTestCases: totalEvaluations,
|
|
1555
1564
|
artifactPath
|
|
@@ -1703,7 +1712,7 @@ function getSimpleCliUsage() {
|
|
|
1703
1712
|
return [
|
|
1704
1713
|
"Usage:",
|
|
1705
1714
|
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1706
|
-
" eval-agents-simple generate --dataset <
|
|
1715
|
+
" eval-agents-simple generate --dataset <datasetId>",
|
|
1707
1716
|
"",
|
|
1708
1717
|
"Options:",
|
|
1709
1718
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
@@ -1771,7 +1780,7 @@ function GenerateView({
|
|
|
1771
1780
|
if (!cancelled) {
|
|
1772
1781
|
setResult({
|
|
1773
1782
|
count: payload.length,
|
|
1774
|
-
datasetName: dataset.dataset
|
|
1783
|
+
datasetName: getDatasetDisplayLabel(dataset.dataset),
|
|
1775
1784
|
outputPath
|
|
1776
1785
|
});
|
|
1777
1786
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1832,7 +1841,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1832
1841
|
const outputPath = createOutputPath(absoluteDatasetPath);
|
|
1833
1842
|
await writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1834
1843
|
`, "utf8");
|
|
1835
|
-
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset
|
|
1844
|
+
console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
|
|
1836
1845
|
console.log(`Wrote ${outputPath}`);
|
|
1837
1846
|
}
|
|
1838
1847
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
@@ -3083,7 +3092,7 @@ async function main() {
|
|
|
3083
3092
|
}
|
|
3084
3093
|
const genDataset = args.datasetName;
|
|
3085
3094
|
if (!genDataset) {
|
|
3086
|
-
console.error("Missing required --dataset <
|
|
3095
|
+
console.error("Missing required --dataset <datasetId> argument.");
|
|
3087
3096
|
printUsageAndExit(1);
|
|
3088
3097
|
}
|
|
3089
3098
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|