@m4trix/evals 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/cli-simple.cjs +36 -15
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +36 -15
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +11 -7
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +11 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +15 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +35 -14
- package/dist/index.js +15 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -80,6 +80,7 @@ export const myEvaluator = Evaluator.define({
|
|
|
80
80
|
inputSchema,
|
|
81
81
|
outputSchema: S.Unknown,
|
|
82
82
|
scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
|
|
83
|
+
// optional: tags: ['suite-a'],
|
|
83
84
|
}).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
|
|
84
85
|
const start = Date.now();
|
|
85
86
|
const value = 85;
|
|
@@ -132,13 +133,15 @@ export const myTestCase = TestCase.describe({
|
|
|
132
133
|
});
|
|
133
134
|
```
|
|
134
135
|
|
|
136
|
+
`tags` is optional; omit it when the test case has no declared labels. Evaluators receive the declared tags as `meta.testCaseTags`.
|
|
137
|
+
|
|
135
138
|
### 4) RunConfig (optional)
|
|
136
139
|
|
|
137
140
|
Group several dataset/evaluator runs under one named config. Each row is either
|
|
138
141
|
`evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
|
|
139
142
|
(wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
140
143
|
|
|
141
|
-
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`)
|
|
144
|
+
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
142
145
|
|
|
143
146
|
```ts
|
|
144
147
|
import { RunConfig } from '@m4trix/evals';
|
|
@@ -160,7 +163,7 @@ export const nightly = RunConfig.define({
|
|
|
160
163
|
eval-agents-simple run --run-config "nightly"
|
|
161
164
|
```
|
|
162
165
|
|
|
163
|
-
Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
|
|
166
|
+
Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap. Use **`--experiment <name>`** to set **`meta.experimentName`** for every evaluator in that CLI run (any non-empty string; surrounding whitespace is trimmed).
|
|
164
167
|
|
|
165
168
|
## CLI Commands
|
|
166
169
|
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -1017,11 +1017,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1017
1017
|
repetitionId,
|
|
1018
1018
|
repetitionIndex,
|
|
1019
1019
|
repetitionCount,
|
|
1020
|
-
runConfigName: task.runConfigName
|
|
1020
|
+
runConfigName: task.runConfigName,
|
|
1021
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1022
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1023
|
+
runConfigTags: task.runConfigTags,
|
|
1024
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1021
1025
|
},
|
|
1022
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1023
|
-
runConfigTags: task.runConfigTags,
|
|
1024
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1025
1026
|
logDiff,
|
|
1026
1027
|
log,
|
|
1027
1028
|
createError
|
|
@@ -1500,7 +1501,8 @@ var EffectRunner = class {
|
|
|
1500
1501
|
globalEvaluationSemaphore: sem,
|
|
1501
1502
|
runConfigName: job.runConfigName,
|
|
1502
1503
|
runConfigTags: job.runConfigTags,
|
|
1503
|
-
repetitions: job.repetitions
|
|
1504
|
+
repetitions: job.repetitions,
|
|
1505
|
+
experimentName: request.experimentName
|
|
1504
1506
|
})
|
|
1505
1507
|
);
|
|
1506
1508
|
}
|
|
@@ -1535,7 +1537,8 @@ var EffectRunner = class {
|
|
|
1535
1537
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1536
1538
|
repetitions: request.repetitions,
|
|
1537
1539
|
runConfigName,
|
|
1538
|
-
runConfigTags: request.runConfigTags
|
|
1540
|
+
runConfigTags: request.runConfigTags,
|
|
1541
|
+
experimentName: request.experimentName
|
|
1539
1542
|
});
|
|
1540
1543
|
}
|
|
1541
1544
|
async startDatasetRun(params) {
|
|
@@ -1610,7 +1613,8 @@ var EffectRunner = class {
|
|
|
1610
1613
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1611
1614
|
runConfigName: params.runConfigName,
|
|
1612
1615
|
runConfigTags,
|
|
1613
|
-
repetitions
|
|
1616
|
+
repetitions,
|
|
1617
|
+
experimentName: params.experimentName
|
|
1614
1618
|
})
|
|
1615
1619
|
);
|
|
1616
1620
|
return snapshot;
|
|
@@ -1730,6 +1734,17 @@ function parseSimpleCliArgs(argv) {
|
|
|
1730
1734
|
index += 1;
|
|
1731
1735
|
continue;
|
|
1732
1736
|
}
|
|
1737
|
+
if (token === "--experiment" && argv[index + 1]) {
|
|
1738
|
+
const raw = argv[index + 1];
|
|
1739
|
+
if (typeof raw === "string") {
|
|
1740
|
+
const trimmed = raw.trim();
|
|
1741
|
+
if (trimmed.length > 0) {
|
|
1742
|
+
args.experimentName = trimmed;
|
|
1743
|
+
}
|
|
1744
|
+
}
|
|
1745
|
+
index += 1;
|
|
1746
|
+
continue;
|
|
1747
|
+
}
|
|
1733
1748
|
args.unknownArgs.push(token);
|
|
1734
1749
|
}
|
|
1735
1750
|
return args;
|
|
@@ -1737,12 +1752,13 @@ function parseSimpleCliArgs(argv) {
|
|
|
1737
1752
|
function getSimpleCliUsage() {
|
|
1738
1753
|
return [
|
|
1739
1754
|
"Usage:",
|
|
1740
|
-
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1755
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
|
|
1741
1756
|
" eval-agents-simple generate --dataset <datasetId>",
|
|
1742
1757
|
"",
|
|
1743
1758
|
"Options:",
|
|
1744
1759
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
1745
|
-
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1760
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1761
|
+
" --experiment <name> With run: set evaluator meta.experimentName for this invocation."
|
|
1746
1762
|
].join("\n");
|
|
1747
1763
|
}
|
|
1748
1764
|
|
|
@@ -2019,6 +2035,7 @@ function RunView({
|
|
|
2019
2035
|
runner,
|
|
2020
2036
|
runConfigNames,
|
|
2021
2037
|
concurrency,
|
|
2038
|
+
experimentName,
|
|
2022
2039
|
onComplete
|
|
2023
2040
|
}) {
|
|
2024
2041
|
const [phase, setPhase] = React.useState("loading");
|
|
@@ -2188,7 +2205,8 @@ function RunView({
|
|
|
2188
2205
|
});
|
|
2189
2206
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2190
2207
|
jobs,
|
|
2191
|
-
globalConcurrency: concurrency
|
|
2208
|
+
globalConcurrency: concurrency,
|
|
2209
|
+
experimentName
|
|
2192
2210
|
});
|
|
2193
2211
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2194
2212
|
const snap = snapshots[i];
|
|
@@ -2245,7 +2263,7 @@ function RunView({
|
|
|
2245
2263
|
setPhase("completed");
|
|
2246
2264
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2247
2265
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2248
|
-
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
2266
|
+
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2249
2267
|
React.useEffect(() => {
|
|
2250
2268
|
void runEval();
|
|
2251
2269
|
}, [runEval]);
|
|
@@ -2733,7 +2751,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2733
2751
|
}
|
|
2734
2752
|
return lines;
|
|
2735
2753
|
}
|
|
2736
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2754
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2737
2755
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2738
2756
|
if (jobs.length === 0) {
|
|
2739
2757
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2942,7 +2960,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
2942
2960
|
console.log("");
|
|
2943
2961
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2944
2962
|
jobs,
|
|
2945
|
-
globalConcurrency: concurrency
|
|
2963
|
+
globalConcurrency: concurrency,
|
|
2964
|
+
experimentName
|
|
2946
2965
|
});
|
|
2947
2966
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2948
2967
|
const snap = snapshots[i];
|
|
@@ -3042,13 +3061,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
3042
3061
|
}
|
|
3043
3062
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3044
3063
|
}
|
|
3045
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
3064
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3046
3065
|
return new Promise((resolve5, reject) => {
|
|
3047
3066
|
const app = ink.render(
|
|
3048
3067
|
React__namespace.createElement(RunView, {
|
|
3049
3068
|
runner,
|
|
3050
3069
|
runConfigNames,
|
|
3051
3070
|
concurrency,
|
|
3071
|
+
experimentName,
|
|
3052
3072
|
onComplete: (err, exitCode) => {
|
|
3053
3073
|
app.unmount();
|
|
3054
3074
|
if (err) {
|
|
@@ -3109,7 +3129,8 @@ async function main() {
|
|
|
3109
3129
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3110
3130
|
runner,
|
|
3111
3131
|
args.runConfigNames,
|
|
3112
|
-
concurrency
|
|
3132
|
+
concurrency,
|
|
3133
|
+
args.experimentName
|
|
3113
3134
|
);
|
|
3114
3135
|
if (args.ci && exitCode !== 0) {
|
|
3115
3136
|
process.exit(1);
|