@m4trix/evals 0.26.0 → 0.28.0
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- package/README.md +8 -4
- package/dist/cli-simple.cjs +53 -23
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +53 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +25 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +25 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +108 -79
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +65 -24
- package/dist/index.js +106 -80
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -62,7 +62,8 @@ export default defineConfig((): ConfigType => ({
|
|
|
62
62
|
import { Dataset } from '@m4trix/evals';
|
|
63
63
|
|
|
64
64
|
export const myDataset = Dataset.define({
|
|
65
|
-
name: '
|
|
65
|
+
name: 'my-dataset',
|
|
66
|
+
displayName: 'My Dataset',
|
|
66
67
|
includedTags: ['demo'],
|
|
67
68
|
});
|
|
68
69
|
```
|
|
@@ -79,6 +80,7 @@ export const myEvaluator = Evaluator.define({
|
|
|
79
80
|
inputSchema,
|
|
80
81
|
outputSchema: S.Unknown,
|
|
81
82
|
scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
|
|
83
|
+
// optional: tags: ['suite-a'],
|
|
82
84
|
}).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
|
|
83
85
|
const start = Date.now();
|
|
84
86
|
const value = 85;
|
|
@@ -131,13 +133,15 @@ export const myTestCase = TestCase.describe({
|
|
|
131
133
|
});
|
|
132
134
|
```
|
|
133
135
|
|
|
136
|
+
`tags` is optional; omit it when the test case has no declared labels. Evaluators read them as `meta.testCaseTags`.
|
|
137
|
+
|
|
134
138
|
### 4) RunConfig (optional)
|
|
135
139
|
|
|
136
140
|
Group several dataset/evaluator runs under one named config. Each row is either
|
|
137
141
|
`evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
|
|
138
142
|
(wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
139
143
|
|
|
140
|
-
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`runConfigName
|
|
144
|
+
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
141
145
|
|
|
142
146
|
```ts
|
|
143
147
|
import { RunConfig } from '@m4trix/evals';
|
|
@@ -159,13 +163,13 @@ export const nightly = RunConfig.define({
|
|
|
159
163
|
eval-agents-simple run --run-config "nightly"
|
|
160
164
|
```
|
|
161
165
|
|
|
162
|
-
Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
|
|
166
|
+
Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap. Use **`--experiment <name>`** to set **`meta.experimentName`** for every evaluator in that CLI run (any non-empty string; trimmed).
|
|
163
167
|
|
|
164
168
|
## CLI Commands
|
|
165
169
|
|
|
166
170
|
- `eval-agents`: interactive CLI (starts runs with synthetic meta `programmatic` / `Programmatic`)
|
|
167
171
|
- `eval-agents-simple run --run-config "<RunConfig name>"` (repeatable; case-insensitive match); add **`--ci`** to exit with code **1** if any test case fails
|
|
168
|
-
- `eval-agents-simple generate --dataset "<dataset
|
|
172
|
+
- `eval-agents-simple generate --dataset "<dataset id>"` (canonical **`Dataset` `name`**, case-insensitive)
|
|
169
173
|
|
|
170
174
|
## Default Discovery and Artifacts
|
|
171
175
|
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -55,6 +55,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
55
55
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
56
56
|
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
57
57
|
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
58
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
58
59
|
function validateWithSchema(schema, raw, context) {
|
|
59
60
|
const trimmed = raw.trim();
|
|
60
61
|
const decode = effect.Schema.decodeUnknownEither(
|
|
@@ -611,6 +612,14 @@ function getTestCaseTagList(testCase) {
|
|
|
611
612
|
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
612
613
|
}
|
|
613
614
|
|
|
615
|
+
// src/evals/dataset.ts
|
|
616
|
+
function getDatasetDisplayLabel(dataset) {
|
|
617
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
618
|
+
return dataset.getDisplayLabel();
|
|
619
|
+
}
|
|
620
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
621
|
+
}
|
|
622
|
+
|
|
614
623
|
// src/evals/metric.ts
|
|
615
624
|
var registry = /* @__PURE__ */ new Map();
|
|
616
625
|
var Metric = {
|
|
@@ -1004,15 +1013,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1004
1013
|
meta: {
|
|
1005
1014
|
triggerId: task.triggerId,
|
|
1006
1015
|
runId: evaluatorRunId,
|
|
1007
|
-
|
|
1016
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1008
1017
|
repetitionId,
|
|
1009
1018
|
repetitionIndex,
|
|
1010
1019
|
repetitionCount,
|
|
1011
|
-
runConfigName: task.runConfigName
|
|
1020
|
+
runConfigName: task.runConfigName,
|
|
1021
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1022
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1023
|
+
runConfigTags: task.runConfigTags,
|
|
1024
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1012
1025
|
},
|
|
1013
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1014
|
-
runConfigTags: task.runConfigTags,
|
|
1015
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1016
1026
|
logDiff,
|
|
1017
1027
|
log,
|
|
1018
1028
|
createError
|
|
@@ -1419,7 +1429,7 @@ var EffectRunner = class {
|
|
|
1419
1429
|
);
|
|
1420
1430
|
if (!dsCollected) {
|
|
1421
1431
|
throw new Error(
|
|
1422
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
1432
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1423
1433
|
);
|
|
1424
1434
|
}
|
|
1425
1435
|
let evaluatorIds;
|
|
@@ -1491,7 +1501,8 @@ var EffectRunner = class {
|
|
|
1491
1501
|
globalEvaluationSemaphore: sem,
|
|
1492
1502
|
runConfigName: job.runConfigName,
|
|
1493
1503
|
runConfigTags: job.runConfigTags,
|
|
1494
|
-
repetitions: job.repetitions
|
|
1504
|
+
repetitions: job.repetitions,
|
|
1505
|
+
experimentName: request.experimentName
|
|
1495
1506
|
})
|
|
1496
1507
|
);
|
|
1497
1508
|
}
|
|
@@ -1526,7 +1537,8 @@ var EffectRunner = class {
|
|
|
1526
1537
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1527
1538
|
repetitions: request.repetitions,
|
|
1528
1539
|
runConfigName,
|
|
1529
|
-
runConfigTags: request.runConfigTags
|
|
1540
|
+
runConfigTags: request.runConfigTags,
|
|
1541
|
+
experimentName: request.experimentName
|
|
1530
1542
|
});
|
|
1531
1543
|
}
|
|
1532
1544
|
async startDatasetRun(params) {
|
|
@@ -1554,7 +1566,7 @@ var EffectRunner = class {
|
|
|
1554
1566
|
const snapshot = {
|
|
1555
1567
|
runId,
|
|
1556
1568
|
datasetId: params.datasetId,
|
|
1557
|
-
datasetName: dataset.dataset.
|
|
1569
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1558
1570
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1559
1571
|
queuedAt: Date.now(),
|
|
1560
1572
|
totalTestCases: totalEvaluations,
|
|
@@ -1575,7 +1587,7 @@ var EffectRunner = class {
|
|
|
1575
1587
|
type: "RunQueued",
|
|
1576
1588
|
runId,
|
|
1577
1589
|
datasetId: params.datasetId,
|
|
1578
|
-
datasetName: dataset.dataset.
|
|
1590
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1579
1591
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1580
1592
|
totalTestCases: totalEvaluations,
|
|
1581
1593
|
artifactPath
|
|
@@ -1601,7 +1613,8 @@ var EffectRunner = class {
|
|
|
1601
1613
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
1602
1614
|
runConfigName: params.runConfigName,
|
|
1603
1615
|
runConfigTags,
|
|
1604
|
-
repetitions
|
|
1616
|
+
repetitions,
|
|
1617
|
+
experimentName: params.experimentName
|
|
1605
1618
|
})
|
|
1606
1619
|
);
|
|
1607
1620
|
return snapshot;
|
|
@@ -1721,6 +1734,17 @@ function parseSimpleCliArgs(argv) {
|
|
|
1721
1734
|
index += 1;
|
|
1722
1735
|
continue;
|
|
1723
1736
|
}
|
|
1737
|
+
if (token === "--experiment" && argv[index + 1]) {
|
|
1738
|
+
const raw = argv[index + 1];
|
|
1739
|
+
if (typeof raw === "string") {
|
|
1740
|
+
const trimmed = raw.trim();
|
|
1741
|
+
if (trimmed.length > 0) {
|
|
1742
|
+
args.experimentName = trimmed;
|
|
1743
|
+
}
|
|
1744
|
+
}
|
|
1745
|
+
index += 1;
|
|
1746
|
+
continue;
|
|
1747
|
+
}
|
|
1724
1748
|
args.unknownArgs.push(token);
|
|
1725
1749
|
}
|
|
1726
1750
|
return args;
|
|
@@ -1728,12 +1752,13 @@ function parseSimpleCliArgs(argv) {
|
|
|
1728
1752
|
function getSimpleCliUsage() {
|
|
1729
1753
|
return [
|
|
1730
1754
|
"Usage:",
|
|
1731
|
-
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1732
|
-
" eval-agents-simple generate --dataset <
|
|
1755
|
+
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
|
|
1756
|
+
" eval-agents-simple generate --dataset <datasetId>",
|
|
1733
1757
|
"",
|
|
1734
1758
|
"Options:",
|
|
1735
1759
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
1736
|
-
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
|
|
1760
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1761
|
+
" --experiment <name> With run: set evaluator meta.experimentName for this invocation."
|
|
1737
1762
|
].join("\n");
|
|
1738
1763
|
}
|
|
1739
1764
|
|
|
@@ -1797,7 +1822,7 @@ function GenerateView({
|
|
|
1797
1822
|
if (!cancelled) {
|
|
1798
1823
|
setResult({
|
|
1799
1824
|
count: payload.length,
|
|
1800
|
-
datasetName: dataset.dataset
|
|
1825
|
+
datasetName: getDatasetDisplayLabel(dataset.dataset),
|
|
1801
1826
|
outputPath
|
|
1802
1827
|
});
|
|
1803
1828
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1858,7 +1883,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1858
1883
|
const outputPath = createOutputPath(absoluteDatasetPath);
|
|
1859
1884
|
await promises.writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1860
1885
|
`, "utf8");
|
|
1861
|
-
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset
|
|
1886
|
+
console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
|
|
1862
1887
|
console.log(`Wrote ${outputPath}`);
|
|
1863
1888
|
}
|
|
1864
1889
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
@@ -2010,6 +2035,7 @@ function RunView({
|
|
|
2010
2035
|
runner,
|
|
2011
2036
|
runConfigNames,
|
|
2012
2037
|
concurrency,
|
|
2038
|
+
experimentName,
|
|
2013
2039
|
onComplete
|
|
2014
2040
|
}) {
|
|
2015
2041
|
const [phase, setPhase] = React.useState("loading");
|
|
@@ -2179,7 +2205,8 @@ function RunView({
|
|
|
2179
2205
|
});
|
|
2180
2206
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2181
2207
|
jobs,
|
|
2182
|
-
globalConcurrency: concurrency
|
|
2208
|
+
globalConcurrency: concurrency,
|
|
2209
|
+
experimentName
|
|
2183
2210
|
});
|
|
2184
2211
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2185
2212
|
const snap = snapshots[i];
|
|
@@ -2236,7 +2263,7 @@ function RunView({
|
|
|
2236
2263
|
setPhase("completed");
|
|
2237
2264
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2238
2265
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2239
|
-
}, [runner, runConfigNames, concurrency, onComplete]);
|
|
2266
|
+
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2240
2267
|
React.useEffect(() => {
|
|
2241
2268
|
void runEval();
|
|
2242
2269
|
}, [runEval]);
|
|
@@ -2724,7 +2751,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2724
2751
|
}
|
|
2725
2752
|
return lines;
|
|
2726
2753
|
}
|
|
2727
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
|
|
2754
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2728
2755
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2729
2756
|
if (jobs.length === 0) {
|
|
2730
2757
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2933,7 +2960,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
2933
2960
|
console.log("");
|
|
2934
2961
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2935
2962
|
jobs,
|
|
2936
|
-
globalConcurrency: concurrency
|
|
2963
|
+
globalConcurrency: concurrency,
|
|
2964
|
+
experimentName
|
|
2937
2965
|
});
|
|
2938
2966
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2939
2967
|
const snap = snapshots[i];
|
|
@@ -3033,13 +3061,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
|
|
|
3033
3061
|
}
|
|
3034
3062
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3035
3063
|
}
|
|
3036
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
|
|
3064
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3037
3065
|
return new Promise((resolve5, reject) => {
|
|
3038
3066
|
const app = ink.render(
|
|
3039
3067
|
React__namespace.createElement(RunView, {
|
|
3040
3068
|
runner,
|
|
3041
3069
|
runConfigNames,
|
|
3042
3070
|
concurrency,
|
|
3071
|
+
experimentName,
|
|
3043
3072
|
onComplete: (err, exitCode) => {
|
|
3044
3073
|
app.unmount();
|
|
3045
3074
|
if (err) {
|
|
@@ -3100,7 +3129,8 @@ async function main() {
|
|
|
3100
3129
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3101
3130
|
runner,
|
|
3102
3131
|
args.runConfigNames,
|
|
3103
|
-
concurrency
|
|
3132
|
+
concurrency,
|
|
3133
|
+
args.experimentName
|
|
3104
3134
|
);
|
|
3105
3135
|
if (args.ci && exitCode !== 0) {
|
|
3106
3136
|
process.exit(1);
|
|
@@ -3109,7 +3139,7 @@ async function main() {
|
|
|
3109
3139
|
}
|
|
3110
3140
|
const genDataset = args.datasetName;
|
|
3111
3141
|
if (!genDataset) {
|
|
3112
|
-
console.error("Missing required --dataset <
|
|
3142
|
+
console.error("Missing required --dataset <datasetId> argument.");
|
|
3113
3143
|
printUsageAndExit(1);
|
|
3114
3144
|
}
|
|
3115
3145
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|