@m4trix/evals 0.27.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/cli-simple.cjs +38 -15
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +38 -15
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +13 -7
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +13 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +39 -14
- package/dist/index.js +17 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -80,6 +80,7 @@ export const myEvaluator = Evaluator.define({
|
|
|
80
80
|
inputSchema,
|
|
81
81
|
outputSchema: S.Unknown,
|
|
82
82
|
scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
|
|
83
|
+
// optional: tags: ['suite-a'],
|
|
83
84
|
}).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
|
|
84
85
|
const start = Date.now();
|
|
85
86
|
const value = 85;
|
|
@@ -132,13 +133,15 @@ export const myTestCase = TestCase.describe({
|
|
|
132
133
|
});
|
|
133
134
|
```
|
|
134
135
|
|
|
136
|
+
`tags` is optional; omit it when the test case has no declared labels. Evaluators read the declared tags as `meta.testCaseTags`.
|
|
137
|
+
|
|
135
138
|
### 4) RunConfig (optional)
|
|
136
139
|
|
|
137
140
|
Group several dataset/evaluator runs under one named config. Each row is either
|
|
138
141
|
`evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
|
|
139
142
|
(wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
140
143
|
|
|
141
|
-
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`)
|
|
144
|
+
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
142
145
|
|
|
143
146
|
```ts
|
|
144
147
|
import { RunConfig } from '@m4trix/evals';
|
|
@@ -160,7 +163,7 @@ export const nightly = RunConfig.define({
|
|
|
160
163
|
eval-agents-simple run --run-config "nightly"
|
|
161
164
|
```
|
|
162
165
|
|
|
163
|
-
Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
|
|
166
|
+
Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap. Use **`--experiment <name>`** to set **`meta.experimentName`** for every evaluator in that CLI run (any non-empty string; surrounding whitespace is trimmed).
|
|
164
167
|
|
|
165
168
|
## CLI Commands
|
|
166
169
|
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -1014,14 +1014,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1014
1014
|
triggerId: task.triggerId,
|
|
1015
1015
|
runId: evaluatorRunId,
|
|
1016
1016
|
datasetName: task.dataset.getDisplayLabel(),
|
|
1017
|
+
testCaseId: testCaseItem.id,
|
|
1018
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1017
1019
|
repetitionId,
|
|
1018
1020
|
repetitionIndex,
|
|
1019
1021
|
repetitionCount,
|
|
1020
|
-
runConfigName: task.runConfigName
|
|
1022
|
+
runConfigName: task.runConfigName,
|
|
1023
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1024
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1025
|
+
runConfigTags: task.runConfigTags,
|
|
1026
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1021
1027
|
},
|
|
1022
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1023
|
-
runConfigTags: task.runConfigTags,
|
|
1024
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1025
1028
|
logDiff,
|
|
1026
1029
|
log,
|
|
1027
1030
|
createError
|
|
@@ -1500,7 +1503,8 @@ var EffectRunner = class {
|
|
|
1500
1503
|
globalEvaluationSemaphore: sem,
|
|
1501
1504
|
runConfigName: job.runConfigName,
|
|
1502
1505
|
runConfigTags: job.runConfigTags,
|
|
1503
|
-
repetitions: job.repetitions
|
|
1506
|
+
repetitions: job.repetitions,
|
|
1507
|
+
experimentName: request.experimentName
|
|
1504
1508
|
})
|
|
1505
1509
|
);
|
|
1506
1510
|
}
|
|
@@ -1535,7 +1539,8 @@ var EffectRunner = class {
|
|
|
1535
1539
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1536
1540
|
repetitions: request.repetitions,
|
|
1537
1541
|
runConfigName,
|
|
1538
|
-
runConfigTags: request.runConfigTags
|
|
1542
|
+
runConfigTags: request.runConfigTags,
|
|
1543
|
+
experimentName: request.experimentName
|
|
1539
1544
|
});
|
|
1540
1545
|
}
|
|
1541
1546
|
async startDatasetRun(params) {
|
|
@@ -1610,7 +1615,8 @@ var EffectRunner = class {
|
|
|
1610
1615
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1611
1616
|
runConfigName: params.runConfigName,
|
|
1612
1617
|
runConfigTags,
|
|
1613
|
-
repetitions
|
|
1618
|
+
repetitions,
|
|
1619
|
+
experimentName: params.experimentName
|
|
1614
1620
|
})
|
|
1615
1621
|
);
|
|
1616
1622
|
return snapshot;
|
|
@@ -1730,6 +1736,17 @@ function parseSimpleCliArgs(argv) {
|
|
|
1730
1736
|
index += 1;
|
|
1731
1737
|
continue;
|
|
1732
1738
|
}
|
|
1739
|
+
if (token === "--experiment" && argv[index + 1]) {
|
|
1740
|
+
const raw = argv[index + 1];
|
|
1741
|
+
if (typeof raw === "string") {
|
|
1742
|
+
const trimmed = raw.trim();
|
|
1743
|
+
if (trimmed.length > 0) {
|
|
1744
|
+
args.experimentName = trimmed;
|
|
1745
|
+
}
|
|
1746
|
+
}
|
|
1747
|
+
index += 1;
|
|
1748
|
+
continue;
|
|
1749
|
+
}
|
|
1733
1750
|
args.unknownArgs.push(token);
|
|
1734
1751
|
}
|
|
1735
1752
|
return args;
|
|
@@ -1737,12 +1754,13 @@ function parseSimpleCliArgs(argv) {
|
|
|
1737
1754
|
function getSimpleCliUsage() {
|
|
1738
1755
|
return [
|
|
1739
1756
|
"Usage:",
|
|
1740
|
-
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1757
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
|
|
1741
1758
|
" eval-agents-simple generate --dataset <datasetId>",
|
|
1742
1759
|
"",
|
|
1743
1760
|
"Options:",
|
|
1744
1761
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
1745
|
-
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1762
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1763
|
+
" --experiment <name> With run: set evaluator meta.experimentName for this invocation."
|
|
1746
1764
|
].join("\n");
|
|
1747
1765
|
}
|
|
1748
1766
|
|
|
@@ -2019,6 +2037,7 @@ function RunView({
|
|
|
2019
2037
|
runner,
|
|
2020
2038
|
runConfigNames,
|
|
2021
2039
|
concurrency,
|
|
2040
|
+
experimentName,
|
|
2022
2041
|
onComplete
|
|
2023
2042
|
}) {
|
|
2024
2043
|
const [phase, setPhase] = React.useState("loading");
|
|
@@ -2188,7 +2207,8 @@ function RunView({
|
|
|
2188
2207
|
});
|
|
2189
2208
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2190
2209
|
jobs,
|
|
2191
|
-
globalConcurrency: concurrency
|
|
2210
|
+
globalConcurrency: concurrency,
|
|
2211
|
+
experimentName
|
|
2192
2212
|
});
|
|
2193
2213
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2194
2214
|
const snap = snapshots[i];
|
|
@@ -2245,7 +2265,7 @@ function RunView({
|
|
|
2245
2265
|
setPhase("completed");
|
|
2246
2266
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2247
2267
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2248
|
-
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
2268
|
+
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2249
2269
|
React.useEffect(() => {
|
|
2250
2270
|
void runEval();
|
|
2251
2271
|
}, [runEval]);
|
|
@@ -2733,7 +2753,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2733
2753
|
}
|
|
2734
2754
|
return lines;
|
|
2735
2755
|
}
|
|
2736
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2756
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2737
2757
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2738
2758
|
if (jobs.length === 0) {
|
|
2739
2759
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2942,7 +2962,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
2942
2962
|
console.log("");
|
|
2943
2963
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2944
2964
|
jobs,
|
|
2945
|
-
globalConcurrency: concurrency
|
|
2965
|
+
globalConcurrency: concurrency,
|
|
2966
|
+
experimentName
|
|
2946
2967
|
});
|
|
2947
2968
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2948
2969
|
const snap = snapshots[i];
|
|
@@ -3042,13 +3063,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
3042
3063
|
}
|
|
3043
3064
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3044
3065
|
}
|
|
3045
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
3066
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3046
3067
|
return new Promise((resolve5, reject) => {
|
|
3047
3068
|
const app = ink.render(
|
|
3048
3069
|
React__namespace.createElement(RunView, {
|
|
3049
3070
|
runner,
|
|
3050
3071
|
runConfigNames,
|
|
3051
3072
|
concurrency,
|
|
3073
|
+
experimentName,
|
|
3052
3074
|
onComplete: (err, exitCode) => {
|
|
3053
3075
|
app.unmount();
|
|
3054
3076
|
if (err) {
|
|
@@ -3109,7 +3131,8 @@ async function main() {
|
|
|
3109
3131
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3110
3132
|
runner,
|
|
3111
3133
|
args.runConfigNames,
|
|
3112
|
-
concurrency
|
|
3134
|
+
concurrency,
|
|
3135
|
+
args.experimentName
|
|
3113
3136
|
);
|
|
3114
3137
|
if (args.ci && exitCode !== 0) {
|
|
3115
3138
|
process.exit(1);
|