@m4trix/evals 0.26.0 → 0.27.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +4 -3
- package/dist/cli-simple.cjs +17 -8
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +17 -8
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +14 -5
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +14 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +93 -69
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +30 -10
- package/dist/index.js +91 -70
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -62,7 +62,8 @@ export default defineConfig((): ConfigType => ({
|
|
|
62
62
|
import { Dataset } from '@m4trix/evals';
|
|
63
63
|
|
|
64
64
|
export const myDataset = Dataset.define({
|
|
65
|
-
name: '
|
|
65
|
+
name: 'my-dataset',
|
|
66
|
+
displayName: 'My Dataset',
|
|
66
67
|
includedTags: ['demo'],
|
|
67
68
|
});
|
|
68
69
|
```
|
|
@@ -137,7 +138,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
|
|
|
137
138
|
`evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
|
|
138
139
|
(wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
139
140
|
|
|
140
|
-
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`runConfigName`**: the **`RunConfig`**
|
|
141
|
+
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`) and **`runConfigName`**: the **`RunConfig`** id (or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**). **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
141
142
|
|
|
142
143
|
```ts
|
|
143
144
|
import { RunConfig } from '@m4trix/evals';
|
|
@@ -165,7 +166,7 @@ Repeat **`--run-config`** to queue several configs; jobs share one **`--concurre
|
|
|
165
166
|
|
|
166
167
|
- `eval-agents`: interactive CLI (starts runs with synthetic meta `programmatic` / `Programmatic`)
|
|
167
168
|
- `eval-agents-simple run --run-config "<RunConfig name>"` (repeatable; case-insensitive match); add **`--ci`** to exit with code **1** if any test case fails
|
|
168
|
-
- `eval-agents-simple generate --dataset "<dataset
|
|
169
|
+
- `eval-agents-simple generate --dataset "<dataset id>"` (canonical **`Dataset` `name`**, case-insensitive)
|
|
169
170
|
|
|
170
171
|
## Default Discovery and Artifacts
|
|
171
172
|
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -55,6 +55,7 @@ function makeEntityIdSchema(brand, label) {
|
|
|
55
55
|
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
|
|
56
56
|
makeEntityIdSchema("EvaluatorName", "Evaluator name");
|
|
57
57
|
makeEntityIdSchema("TestCaseName", "Test case name");
|
|
58
|
+
makeEntityIdSchema("DatasetName", "Dataset name");
|
|
58
59
|
function validateWithSchema(schema, raw, context) {
|
|
59
60
|
const trimmed = raw.trim();
|
|
60
61
|
const decode = effect.Schema.decodeUnknownEither(
|
|
@@ -611,6 +612,14 @@ function getTestCaseTagList(testCase) {
|
|
|
611
612
|
return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
|
|
612
613
|
}
|
|
613
614
|
|
|
615
|
+
// src/evals/dataset.ts
|
|
616
|
+
function getDatasetDisplayLabel(dataset) {
|
|
617
|
+
if (typeof dataset.getDisplayLabel === "function") {
|
|
618
|
+
return dataset.getDisplayLabel();
|
|
619
|
+
}
|
|
620
|
+
return typeof dataset.getName === "function" ? dataset.getName() : "";
|
|
621
|
+
}
|
|
622
|
+
|
|
614
623
|
// src/evals/metric.ts
|
|
615
624
|
var registry = /* @__PURE__ */ new Map();
|
|
616
625
|
var Metric = {
|
|
@@ -1004,7 +1013,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1004
1013
|
meta: {
|
|
1005
1014
|
triggerId: task.triggerId,
|
|
1006
1015
|
runId: evaluatorRunId,
|
|
1007
|
-
|
|
1016
|
+
datasetName: task.dataset.getDisplayLabel(),
|
|
1008
1017
|
repetitionId,
|
|
1009
1018
|
repetitionIndex,
|
|
1010
1019
|
repetitionCount,
|
|
@@ -1419,7 +1428,7 @@ var EffectRunner = class {
|
|
|
1419
1428
|
);
|
|
1420
1429
|
if (!dsCollected) {
|
|
1421
1430
|
throw new Error(
|
|
1422
|
-
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.
|
|
1431
|
+
`RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
|
|
1423
1432
|
);
|
|
1424
1433
|
}
|
|
1425
1434
|
let evaluatorIds;
|
|
@@ -1554,7 +1563,7 @@ var EffectRunner = class {
|
|
|
1554
1563
|
const snapshot = {
|
|
1555
1564
|
runId,
|
|
1556
1565
|
datasetId: params.datasetId,
|
|
1557
|
-
datasetName: dataset.dataset.
|
|
1566
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1558
1567
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1559
1568
|
queuedAt: Date.now(),
|
|
1560
1569
|
totalTestCases: totalEvaluations,
|
|
@@ -1575,7 +1584,7 @@ var EffectRunner = class {
|
|
|
1575
1584
|
type: "RunQueued",
|
|
1576
1585
|
runId,
|
|
1577
1586
|
datasetId: params.datasetId,
|
|
1578
|
-
datasetName: dataset.dataset.
|
|
1587
|
+
datasetName: dataset.dataset.getDisplayLabel(),
|
|
1579
1588
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1580
1589
|
totalTestCases: totalEvaluations,
|
|
1581
1590
|
artifactPath
|
|
@@ -1729,7 +1738,7 @@ function getSimpleCliUsage() {
|
|
|
1729
1738
|
return [
|
|
1730
1739
|
"Usage:",
|
|
1731
1740
|
" eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
|
|
1732
|
-
" eval-agents-simple generate --dataset <
|
|
1741
|
+
" eval-agents-simple generate --dataset <datasetId>",
|
|
1733
1742
|
"",
|
|
1734
1743
|
"Options:",
|
|
1735
1744
|
" --ci With run: exit with code 1 if any test case fails.",
|
|
@@ -1797,7 +1806,7 @@ function GenerateView({
|
|
|
1797
1806
|
if (!cancelled) {
|
|
1798
1807
|
setResult({
|
|
1799
1808
|
count: payload.length,
|
|
1800
|
-
datasetName: dataset.dataset
|
|
1809
|
+
datasetName: getDatasetDisplayLabel(dataset.dataset),
|
|
1801
1810
|
outputPath
|
|
1802
1811
|
});
|
|
1803
1812
|
setTimeout(() => onComplete(), 200);
|
|
@@ -1858,7 +1867,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1858
1867
|
const outputPath = createOutputPath(absoluteDatasetPath);
|
|
1859
1868
|
await promises.writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1860
1869
|
`, "utf8");
|
|
1861
|
-
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset
|
|
1870
|
+
console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
|
|
1862
1871
|
console.log(`Wrote ${outputPath}`);
|
|
1863
1872
|
}
|
|
1864
1873
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
@@ -3109,7 +3118,7 @@ async function main() {
|
|
|
3109
3118
|
}
|
|
3110
3119
|
const genDataset = args.datasetName;
|
|
3111
3120
|
if (!genDataset) {
|
|
3112
|
-
console.error("Missing required --dataset <
|
|
3121
|
+
console.error("Missing required --dataset <datasetId> argument.");
|
|
3113
3122
|
printUsageAndExit(1);
|
|
3114
3123
|
}
|
|
3115
3124
|
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|