@m4trix/evals 0.27.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/cli-simple.cjs +38 -15
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +38 -15
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +13 -7
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +13 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +39 -14
- package/dist/index.js +17 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -988,14 +988,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
988
988
|
triggerId: task.triggerId,
|
|
989
989
|
runId: evaluatorRunId,
|
|
990
990
|
datasetName: task.dataset.getDisplayLabel(),
|
|
991
|
+
testCaseId: testCaseItem.id,
|
|
992
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
991
993
|
repetitionId,
|
|
992
994
|
repetitionIndex,
|
|
993
995
|
repetitionCount,
|
|
994
|
-
runConfigName: task.runConfigName
|
|
996
|
+
runConfigName: task.runConfigName,
|
|
997
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
998
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
999
|
+
runConfigTags: task.runConfigTags,
|
|
1000
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
995
1001
|
},
|
|
996
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
997
|
-
runConfigTags: task.runConfigTags,
|
|
998
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
999
1002
|
logDiff,
|
|
1000
1003
|
log,
|
|
1001
1004
|
createError
|
|
@@ -1474,7 +1477,8 @@ var EffectRunner = class {
|
|
|
1474
1477
|
globalEvaluationSemaphore: sem,
|
|
1475
1478
|
runConfigName: job.runConfigName,
|
|
1476
1479
|
runConfigTags: job.runConfigTags,
|
|
1477
|
-
repetitions: job.repetitions
|
|
1480
|
+
repetitions: job.repetitions,
|
|
1481
|
+
experimentName: request.experimentName
|
|
1478
1482
|
})
|
|
1479
1483
|
);
|
|
1480
1484
|
}
|
|
@@ -1509,7 +1513,8 @@ var EffectRunner = class {
|
|
|
1509
1513
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1510
1514
|
repetitions: request.repetitions,
|
|
1511
1515
|
runConfigName,
|
|
1512
|
-
runConfigTags: request.runConfigTags
|
|
1516
|
+
runConfigTags: request.runConfigTags,
|
|
1517
|
+
experimentName: request.experimentName
|
|
1513
1518
|
});
|
|
1514
1519
|
}
|
|
1515
1520
|
async startDatasetRun(params) {
|
|
@@ -1584,7 +1589,8 @@ var EffectRunner = class {
|
|
|
1584
1589
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1585
1590
|
runConfigName: params.runConfigName,
|
|
1586
1591
|
runConfigTags,
|
|
1587
|
-
repetitions
|
|
1592
|
+
repetitions,
|
|
1593
|
+
experimentName: params.experimentName
|
|
1588
1594
|
})
|
|
1589
1595
|
);
|
|
1590
1596
|
return snapshot;
|
|
@@ -1704,6 +1710,17 @@ function parseSimpleCliArgs(argv) {
|
|
|
1704
1710
|
index += 1;
|
|
1705
1711
|
continue;
|
|
1706
1712
|
}
|
|
1713
|
+
if (token === "--experiment" && argv[index + 1]) {
|
|
1714
|
+
const raw = argv[index + 1];
|
|
1715
|
+
if (typeof raw === "string") {
|
|
1716
|
+
const trimmed = raw.trim();
|
|
1717
|
+
if (trimmed.length > 0) {
|
|
1718
|
+
args.experimentName = trimmed;
|
|
1719
|
+
}
|
|
1720
|
+
}
|
|
1721
|
+
index += 1;
|
|
1722
|
+
continue;
|
|
1723
|
+
}
|
|
1707
1724
|
args.unknownArgs.push(token);
|
|
1708
1725
|
}
|
|
1709
1726
|
return args;
|
|
@@ -1711,12 +1728,13 @@ function parseSimpleCliArgs(argv) {
|
|
|
1711
1728
|
function getSimpleCliUsage() {
|
|
1712
1729
|
return [
|
|
1713
1730
|
"Usage:",
|
|
1714
|
-
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1731
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
|
|
1715
1732
|
" eval-agents-simple generate --dataset <datasetId>",
|
|
1716
1733
|
"",
|
|
1717
1734
|
"Options:",
|
|
1718
1735
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
1719
|
-
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1736
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1737
|
+
" --experiment <name> With run: set evaluator meta.experimentName for this invocation."
|
|
1720
1738
|
].join("\n");
|
|
1721
1739
|
}
|
|
1722
1740
|
|
|
@@ -1993,6 +2011,7 @@ function RunView({
|
|
|
1993
2011
|
runner,
|
|
1994
2012
|
runConfigNames,
|
|
1995
2013
|
concurrency,
|
|
2014
|
+
experimentName,
|
|
1996
2015
|
onComplete
|
|
1997
2016
|
}) {
|
|
1998
2017
|
const [phase, setPhase] = useState("loading");
|
|
@@ -2162,7 +2181,8 @@ function RunView({
|
|
|
2162
2181
|
});
|
|
2163
2182
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2164
2183
|
jobs,
|
|
2165
|
-
globalConcurrency: concurrency
|
|
2184
|
+
globalConcurrency: concurrency,
|
|
2185
|
+
experimentName
|
|
2166
2186
|
});
|
|
2167
2187
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2168
2188
|
const snap = snapshots[i];
|
|
@@ -2219,7 +2239,7 @@ function RunView({
|
|
|
2219
2239
|
setPhase("completed");
|
|
2220
2240
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2221
2241
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2222
|
-
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
2242
|
+
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2223
2243
|
useEffect(() => {
|
|
2224
2244
|
void runEval();
|
|
2225
2245
|
}, [runEval]);
|
|
@@ -2707,7 +2727,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2707
2727
|
}
|
|
2708
2728
|
return lines;
|
|
2709
2729
|
}
|
|
2710
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2730
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2711
2731
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2712
2732
|
if (jobs.length === 0) {
|
|
2713
2733
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2916,7 +2936,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
2916
2936
|
console.log("");
|
|
2917
2937
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2918
2938
|
jobs,
|
|
2919
|
-
globalConcurrency: concurrency
|
|
2939
|
+
globalConcurrency: concurrency,
|
|
2940
|
+
experimentName
|
|
2920
2941
|
});
|
|
2921
2942
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2922
2943
|
const snap = snapshots[i];
|
|
@@ -3016,13 +3037,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
3016
3037
|
}
|
|
3017
3038
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3018
3039
|
}
|
|
3019
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
3040
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3020
3041
|
return new Promise((resolve5, reject) => {
|
|
3021
3042
|
const app = render(
|
|
3022
3043
|
React.createElement(RunView, {
|
|
3023
3044
|
runner,
|
|
3024
3045
|
runConfigNames,
|
|
3025
3046
|
concurrency,
|
|
3047
|
+
experimentName,
|
|
3026
3048
|
onComplete: (err, exitCode) => {
|
|
3027
3049
|
app.unmount();
|
|
3028
3050
|
if (err) {
|
|
@@ -3083,7 +3105,8 @@ async function main() {
|
|
|
3083
3105
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3084
3106
|
runner,
|
|
3085
3107
|
args.runConfigNames,
|
|
3086
|
-
concurrency
|
|
3108
|
+
concurrency,
|
|
3109
|
+
args.experimentName
|
|
3087
3110
|
);
|
|
3088
3111
|
if (args.ci && exitCode !== 0) {
|
|
3089
3112
|
process.exit(1);
|