@m4trix/evals 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/cli-simple.cjs +36 -15
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +36 -15
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +11 -7
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +11 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +15 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +35 -14
- package/dist/index.js +15 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -991,11 +991,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
991
991
|
repetitionId,
|
|
992
992
|
repetitionIndex,
|
|
993
993
|
repetitionCount,
|
|
994
|
-
runConfigName: task.runConfigName
|
|
994
|
+
runConfigName: task.runConfigName,
|
|
995
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
996
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
997
|
+
runConfigTags: task.runConfigTags,
|
|
998
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
995
999
|
},
|
|
996
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
997
|
-
runConfigTags: task.runConfigTags,
|
|
998
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
999
1000
|
logDiff,
|
|
1000
1001
|
log,
|
|
1001
1002
|
createError
|
|
@@ -1474,7 +1475,8 @@ var EffectRunner = class {
|
|
|
1474
1475
|
globalEvaluationSemaphore: sem,
|
|
1475
1476
|
runConfigName: job.runConfigName,
|
|
1476
1477
|
runConfigTags: job.runConfigTags,
|
|
1477
|
-
repetitions: job.repetitions
|
|
1478
|
+
repetitions: job.repetitions,
|
|
1479
|
+
experimentName: request.experimentName
|
|
1478
1480
|
})
|
|
1479
1481
|
);
|
|
1480
1482
|
}
|
|
@@ -1509,7 +1511,8 @@ var EffectRunner = class {
|
|
|
1509
1511
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1510
1512
|
repetitions: request.repetitions,
|
|
1511
1513
|
runConfigName,
|
|
1512
|
-
runConfigTags: request.runConfigTags
|
|
1514
|
+
runConfigTags: request.runConfigTags,
|
|
1515
|
+
experimentName: request.experimentName
|
|
1513
1516
|
});
|
|
1514
1517
|
}
|
|
1515
1518
|
async startDatasetRun(params) {
|
|
@@ -1584,7 +1587,8 @@ var EffectRunner = class {
|
|
|
1584
1587
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1585
1588
|
runConfigName: params.runConfigName,
|
|
1586
1589
|
runConfigTags,
|
|
1587
|
-
repetitions
|
|
1590
|
+
repetitions,
|
|
1591
|
+
experimentName: params.experimentName
|
|
1588
1592
|
})
|
|
1589
1593
|
);
|
|
1590
1594
|
return snapshot;
|
|
@@ -1704,6 +1708,17 @@ function parseSimpleCliArgs(argv) {
|
|
|
1704
1708
|
index += 1;
|
|
1705
1709
|
continue;
|
|
1706
1710
|
}
|
|
1711
|
+
if (token === "--experiment" && argv[index + 1]) {
|
|
1712
|
+
const raw = argv[index + 1];
|
|
1713
|
+
if (typeof raw === "string") {
|
|
1714
|
+
const trimmed = raw.trim();
|
|
1715
|
+
if (trimmed.length > 0) {
|
|
1716
|
+
args.experimentName = trimmed;
|
|
1717
|
+
}
|
|
1718
|
+
}
|
|
1719
|
+
index += 1;
|
|
1720
|
+
continue;
|
|
1721
|
+
}
|
|
1707
1722
|
args.unknownArgs.push(token);
|
|
1708
1723
|
}
|
|
1709
1724
|
return args;
|
|
@@ -1711,12 +1726,13 @@ function parseSimpleCliArgs(argv) {
|
|
|
1711
1726
|
function getSimpleCliUsage() {
|
|
1712
1727
|
return [
|
|
1713
1728
|
"Usage:",
|
|
1714
|
-
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1729
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
|
|
1715
1730
|
" eval-agents-simple generate --dataset <datasetId>",
|
|
1716
1731
|
"",
|
|
1717
1732
|
"Options:",
|
|
1718
1733
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
1719
|
-
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1734
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1735
|
+
" --experiment <name> With run: set evaluator meta.experimentName for this invocation."
|
|
1720
1736
|
].join("\n");
|
|
1721
1737
|
}
|
|
1722
1738
|
|
|
@@ -1993,6 +2009,7 @@ function RunView({
|
|
|
1993
2009
|
runner,
|
|
1994
2010
|
runConfigNames,
|
|
1995
2011
|
concurrency,
|
|
2012
|
+
experimentName,
|
|
1996
2013
|
onComplete
|
|
1997
2014
|
}) {
|
|
1998
2015
|
const [phase, setPhase] = useState("loading");
|
|
@@ -2162,7 +2179,8 @@ function RunView({
|
|
|
2162
2179
|
});
|
|
2163
2180
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2164
2181
|
jobs,
|
|
2165
|
-
globalConcurrency: concurrency
|
|
2182
|
+
globalConcurrency: concurrency,
|
|
2183
|
+
experimentName
|
|
2166
2184
|
});
|
|
2167
2185
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2168
2186
|
const snap = snapshots[i];
|
|
@@ -2219,7 +2237,7 @@ function RunView({
|
|
|
2219
2237
|
setPhase("completed");
|
|
2220
2238
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2221
2239
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2222
|
-
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
2240
|
+
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2223
2241
|
useEffect(() => {
|
|
2224
2242
|
void runEval();
|
|
2225
2243
|
}, [runEval]);
|
|
@@ -2707,7 +2725,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2707
2725
|
}
|
|
2708
2726
|
return lines;
|
|
2709
2727
|
}
|
|
2710
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2728
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2711
2729
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2712
2730
|
if (jobs.length === 0) {
|
|
2713
2731
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2916,7 +2934,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
2916
2934
|
console.log("");
|
|
2917
2935
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2918
2936
|
jobs,
|
|
2919
|
-
globalConcurrency: concurrency
|
|
2937
|
+
globalConcurrency: concurrency,
|
|
2938
|
+
experimentName
|
|
2920
2939
|
});
|
|
2921
2940
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2922
2941
|
const snap = snapshots[i];
|
|
@@ -3016,13 +3035,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
3016
3035
|
}
|
|
3017
3036
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3018
3037
|
}
|
|
3019
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
3038
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3020
3039
|
return new Promise((resolve5, reject) => {
|
|
3021
3040
|
const app = render(
|
|
3022
3041
|
React.createElement(RunView, {
|
|
3023
3042
|
runner,
|
|
3024
3043
|
runConfigNames,
|
|
3025
3044
|
concurrency,
|
|
3045
|
+
experimentName,
|
|
3026
3046
|
onComplete: (err, exitCode) => {
|
|
3027
3047
|
app.unmount();
|
|
3028
3048
|
if (err) {
|
|
@@ -3083,7 +3103,8 @@ async function main() {
|
|
|
3083
3103
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3084
3104
|
runner,
|
|
3085
3105
|
args.runConfigNames,
|
|
3086
|
-
concurrency
|
|
3106
|
+
concurrency,
|
|
3107
|
+
args.experimentName
|
|
3087
3108
|
);
|
|
3088
3109
|
if (args.ci && exitCode !== 0) {
|
|
3089
3110
|
process.exit(1);
|