@m4trix/evals 0.26.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/dist/cli-simple.cjs +53 -23
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +53 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +25 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +25 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +108 -79
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +65 -24
- package/dist/index.js +106 -80
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -29,6 +29,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
29
29
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
30
30
|
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
31
31
|
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
32
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
32
33
|
function validateWithSchema(schema, raw, context) {
|
|
33
34
|
const trimmed = raw.trim();
|
|
34
35
|
const decode = Schema.decodeUnknownEither(
|
|
@@ -585,6 +586,14 @@ function getTestCaseTagList(testCase) {
|
|
|
585
586
|
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
586
587
|
}
|
|
587
588
|
|
|
589
|
+
// src/evals/dataset.ts
|
|
590
|
+
function getDatasetDisplayLabel(dataset) {
|
|
591
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
592
|
+
return dataset.getDisplayLabel();
|
|
593
|
+
}
|
|
594
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
595
|
+
}
|
|
596
|
+
|
|
588
597
|
// src/evals/metric.ts
|
|
589
598
|
var registry = /* @__PURE__ */ new Map();
|
|
590
599
|
var Metric = {
|
|
@@ -978,15 +987,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
978
987
|
meta: {
|
|
979
988
|
triggerId: task.triggerId,
|
|
980
989
|
runId: evaluatorRunId,
|
|
981
|
-
|
|
990
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
982
991
|
repetitionId,
|
|
983
992
|
repetitionIndex,
|
|
984
993
|
repetitionCount,
|
|
985
|
-
runConfigName: task.runConfigName
|
|
994
|
+
runConfigName: task.runConfigName,
|
|
995
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
996
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
997
|
+
runConfigTags: task.runConfigTags,
|
|
998
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
986
999
|
},
|
|
987
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
988
|
-
runConfigTags: task.runConfigTags,
|
|
989
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
990
1000
|
logDiff,
|
|
991
1001
|
log,
|
|
992
1002
|
createError
|
|
@@ -1393,7 +1403,7 @@ var EffectRunner = class {
|
|
|
1393
1403
|
);
|
|
1394
1404
|
if (!dsCollected) {
|
|
1395
1405
|
throw new Error(
|
|
1396
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
1406
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1397
1407
|
);
|
|
1398
1408
|
}
|
|
1399
1409
|
let evaluatorIds;
|
|
@@ -1465,7 +1475,8 @@ var EffectRunner = class {
|
|
|
1465
1475
|
globalEvaluationSemaphore: sem,
|
|
1466
1476
|
runConfigName: job.runConfigName,
|
|
1467
1477
|
runConfigTags: job.runConfigTags,
|
|
1468
|
-
repetitions: job.repetitions
|
|
1478
|
+
repetitions: job.repetitions,
|
|
1479
|
+
experimentName: request.experimentName
|
|
1469
1480
|
})
|
|
1470
1481
|
);
|
|
1471
1482
|
}
|
|
@@ -1500,7 +1511,8 @@ var EffectRunner = class {
|
|
|
1500
1511
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1501
1512
|
repetitions: request.repetitions,
|
|
1502
1513
|
runConfigName,
|
|
1503
|
-
runConfigTags: request.runConfigTags
|
|
1514
|
+
runConfigTags: request.runConfigTags,
|
|
1515
|
+
experimentName: request.experimentName
|
|
1504
1516
|
});
|
|
1505
1517
|
}
|
|
1506
1518
|
async startDatasetRun(params) {
|
|
@@ -1528,7 +1540,7 @@ var EffectRunner = class {
|
|
|
1528
1540
|
const snapshot = {
|
|
1529
1541
|
runId,
|
|
1530
1542
|
datasetId: params.datasetId,
|
|
1531
|
-
datasetName: dataset.dataset.
|
|
1543
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1532
1544
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1533
1545
|
queuedAt: Date.now(),
|
|
1534
1546
|
totalTestCases: totalEvaluations,
|
|
@@ -1549,7 +1561,7 @@ var EffectRunner = class {
|
|
|
1549
1561
|
type: "RunQueued",
|
|
1550
1562
|
runId,
|
|
1551
1563
|
datasetId: params.datasetId,
|
|
1552
|
-
datasetName: dataset.dataset.
|
|
1564
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1553
1565
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1554
1566
|
totalTestCases: totalEvaluations,
|
|
1555
1567
|
artifactPath
|
|
@@ -1575,7 +1587,8 @@ var EffectRunner = class {
|
|
|
1575
1587
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1576
1588
|
runConfigName: params.runConfigName,
|
|
1577
1589
|
runConfigTags,
|
|
1578
|
-
repetitions
|
|
1590
|
+
repetitions,
|
|
1591
|
+
experimentName: params.experimentName
|
|
1579
1592
|
})
|
|
1580
1593
|
);
|
|
1581
1594
|
return snapshot;
|
|
@@ -1695,6 +1708,17 @@ function parseSimpleCliArgs(argv) {
|
|
|
1695
1708
|
index += 1;
|
|
1696
1709
|
continue;
|
|
1697
1710
|
}
|
|
1711
|
+
if (token === "--experiment" && argv[index + 1]) {
|
|
1712
|
+
const raw = argv[index + 1];
|
|
1713
|
+
if (typeof raw === "string") {
|
|
1714
|
+
const trimmed = raw.trim();
|
|
1715
|
+
if (trimmed.length > 0) {
|
|
1716
|
+
args.experimentName = trimmed;
|
|
1717
|
+
}
|
|
1718
|
+
}
|
|
1719
|
+
index += 1;
|
|
1720
|
+
continue;
|
|
1721
|
+
}
|
|
1698
1722
|
args.unknownArgs.push(token);
|
|
1699
1723
|
}
|
|
1700
1724
|
return args;
|
|
@@ -1702,12 +1726,13 @@ function parseSimpleCliArgs(argv) {
|
|
|
1702
1726
|
function getSimpleCliUsage() {
|
|
1703
1727
|
return [
|
|
1704
1728
|
"Usage:",
|
|
1705
|
-
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1706
|
-
" eval-agents-simple generate --dataset <
|
|
1729
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
|
|
1730
|
+
" eval-agents-simple generate --dataset <datasetId>",
|
|
1707
1731
|
"",
|
|
1708
1732
|
"Options:",
|
|
1709
1733
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
1710
|
-
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1734
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1735
|
+
" --experiment <name> With run: set evaluator meta.experimentName for this invocation."
|
|
1711
1736
|
].join("\n");
|
|
1712
1737
|
}
|
|
1713
1738
|
|
|
@@ -1771,7 +1796,7 @@ function GenerateView({
|
|
|
1771
1796
|
if (!cancelled) {
|
|
1772
1797
|
setResult({
|
|
1773
1798
|
count: payload.length,
|
|
1774
|
-
datasetName: dataset.dataset
|
|
1799
|
+
datasetName: getDatasetDisplayLabel(dataset.dataset),
|
|
1775
1800
|
outputPath
|
|
1776
1801
|
});
|
|
1777
1802
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1832,7 +1857,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1832
1857
|
const outputPath = createOutputPath(absoluteDatasetPath);
|
|
1833
1858
|
await writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1834
1859
|
`, "utf8");
|
|
1835
|
-
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset
|
|
1860
|
+
console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
|
|
1836
1861
|
console.log(`Wrote ${outputPath}`);
|
|
1837
1862
|
}
|
|
1838
1863
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
@@ -1984,6 +2009,7 @@ function RunView({
|
|
|
1984
2009
|
runner,
|
|
1985
2010
|
runConfigNames,
|
|
1986
2011
|
concurrency,
|
|
2012
|
+
experimentName,
|
|
1987
2013
|
onComplete
|
|
1988
2014
|
}) {
|
|
1989
2015
|
const [phase, setPhase] = useState("loading");
|
|
@@ -2153,7 +2179,8 @@ function RunView({
|
|
|
2153
2179
|
});
|
|
2154
2180
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2155
2181
|
jobs,
|
|
2156
|
-
globalConcurrency: concurrency
|
|
2182
|
+
globalConcurrency: concurrency,
|
|
2183
|
+
experimentName
|
|
2157
2184
|
});
|
|
2158
2185
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2159
2186
|
const snap = snapshots[i];
|
|
@@ -2210,7 +2237,7 @@ function RunView({
|
|
|
2210
2237
|
setPhase("completed");
|
|
2211
2238
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2212
2239
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2213
|
-
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
2240
|
+
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2214
2241
|
useEffect(() => {
|
|
2215
2242
|
void runEval();
|
|
2216
2243
|
}, [runEval]);
|
|
@@ -2698,7 +2725,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2698
2725
|
}
|
|
2699
2726
|
return lines;
|
|
2700
2727
|
}
|
|
2701
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2728
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2702
2729
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2703
2730
|
if (jobs.length === 0) {
|
|
2704
2731
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2907,7 +2934,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
2907
2934
|
console.log("");
|
|
2908
2935
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2909
2936
|
jobs,
|
|
2910
|
-
globalConcurrency: concurrency
|
|
2937
|
+
globalConcurrency: concurrency,
|
|
2938
|
+
experimentName
|
|
2911
2939
|
});
|
|
2912
2940
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2913
2941
|
const snap = snapshots[i];
|
|
@@ -3007,13 +3035,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
3007
3035
|
}
|
|
3008
3036
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3009
3037
|
}
|
|
3010
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
3038
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3011
3039
|
return new Promise((resolve5, reject) => {
|
|
3012
3040
|
const app = render(
|
|
3013
3041
|
React.createElement(RunView, {
|
|
3014
3042
|
runner,
|
|
3015
3043
|
runConfigNames,
|
|
3016
3044
|
concurrency,
|
|
3045
|
+
experimentName,
|
|
3017
3046
|
onComplete: (err, exitCode) => {
|
|
3018
3047
|
app.unmount();
|
|
3019
3048
|
if (err) {
|
|
@@ -3074,7 +3103,8 @@ async function main() {
|
|
|
3074
3103
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3075
3104
|
runner,
|
|
3076
3105
|
args.runConfigNames,
|
|
3077
|
-
concurrency
|
|
3106
|
+
concurrency,
|
|
3107
|
+
args.experimentName
|
|
3078
3108
|
);
|
|
3079
3109
|
if (args.ci && exitCode !== 0) {
|
|
3080
3110
|
process.exit(1);
|
|
@@ -3083,7 +3113,7 @@ async function main() {
|
|
|
3083
3113
|
}
|
|
3084
3114
|
const genDataset = args.datasetName;
|
|
3085
3115
|
if (!genDataset) {
|
|
3086
|
-
console.error("Missing required --dataset <
|
|
3116
|
+
console.error("Missing required --dataset <datasetId> argument.");
|
|
3087
3117
|
printUsageAndExit(1);
|
|
3088
3118
|
}
|
|
3089
3119
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|