@m4trix/evals 0.26.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -62,7 +62,8 @@ export default defineConfig((): ConfigType => ({
62
62
  import { Dataset } from '@m4trix/evals';
63
63
 
64
64
  export const myDataset = Dataset.define({
65
- name: 'My Dataset',
65
+ name: 'my-dataset',
66
+ displayName: 'My Dataset',
66
67
  includedTags: ['demo'],
67
68
  });
68
69
  ```
@@ -79,6 +80,7 @@ export const myEvaluator = Evaluator.define({
79
80
  inputSchema,
80
81
  outputSchema: S.Unknown,
81
82
  scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
83
+ // optional: tags: ['suite-a'],
82
84
  }).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
83
85
  const start = Date.now();
84
86
  const value = 85;
@@ -131,13 +133,15 @@ export const myTestCase = TestCase.describe({
131
133
  });
132
134
  ```
133
135
 
136
+ `tags` is optional; omit it when the test case has no declared labels. Evaluators receive these tags as `meta.testCaseTags`.
137
+
134
138
  ### 4) RunConfig (optional)
135
139
 
136
140
  Group several dataset/evaluator runs under one named config. Each row is either
137
141
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
138
142
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
139
143
 
140
- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`runConfigName`**: the **`RunConfig`** name (or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**). Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
144
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). A **`Dataset`**'s **`includedTags`** / **`excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
141
145
 
142
146
  ```ts
143
147
  import { RunConfig } from '@m4trix/evals';
@@ -159,13 +163,13 @@ export const nightly = RunConfig.define({
159
163
  eval-agents-simple run --run-config "nightly"
160
164
  ```
161
165
 
162
- Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
166
+ Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap. Use **`--experiment <name>`** to set **`meta.experimentName`** for every evaluator in that CLI run (any non-empty string; trimmed).
163
167
 
164
168
  ## CLI Commands
165
169
 
166
170
  - `eval-agents`: interactive CLI (starts runs with synthetic meta `programmatic` / `Programmatic`)
167
171
  - `eval-agents-simple run --run-config "<RunConfig name>"` (repeatable; case-insensitive match); add **`--ci`** to exit with code **1** if any test case fails
168
- - `eval-agents-simple generate --dataset "<dataset name>"`
172
+ - `eval-agents-simple generate --dataset "<dataset id>"` (canonical **`Dataset` `name`**, case-insensitive)
169
173
 
170
174
  ## Default Discovery and Artifacts
171
175
 
@@ -55,6 +55,7 @@ function makeEntityIdSchema(brand, label) {
55
55
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
56
56
  makeEntityIdSchema("EvaluatorName", "Evaluator name");
57
57
  makeEntityIdSchema("TestCaseName", "Test case name");
58
+ makeEntityIdSchema("DatasetName", "Dataset name");
58
59
  function validateWithSchema(schema, raw, context) {
59
60
  const trimmed = raw.trim();
60
61
  const decode = effect.Schema.decodeUnknownEither(
@@ -611,6 +612,14 @@ function getTestCaseTagList(testCase) {
611
612
  return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
612
613
  }
613
614
 
615
+ // src/evals/dataset.ts
616
+ function getDatasetDisplayLabel(dataset) {
617
+ if (typeof dataset.getDisplayLabel === "function") {
618
+ return dataset.getDisplayLabel();
619
+ }
620
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
621
+ }
622
+
614
623
  // src/evals/metric.ts
615
624
  var registry = /* @__PURE__ */ new Map();
616
625
  var Metric = {
@@ -1004,15 +1013,16 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1004
1013
  meta: {
1005
1014
  triggerId: task.triggerId,
1006
1015
  runId: evaluatorRunId,
1007
- datasetId: task.datasetId,
1016
+ datasetName: task.dataset.getDisplayLabel(),
1008
1017
  repetitionId,
1009
1018
  repetitionIndex,
1010
1019
  repetitionCount,
1011
- runConfigName: task.runConfigName
1020
+ runConfigName: task.runConfigName,
1021
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1022
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1023
+ runConfigTags: task.runConfigTags,
1024
+ evaluatorTags: getEvaluatorTagList(evaluator)
1012
1025
  },
1013
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1014
- runConfigTags: task.runConfigTags,
1015
- evaluatorTags: getEvaluatorTagList(evaluator),
1016
1026
  logDiff,
1017
1027
  log,
1018
1028
  createError
@@ -1419,7 +1429,7 @@ var EffectRunner = class {
1419
1429
  );
1420
1430
  if (!dsCollected) {
1421
1431
  throw new Error(
1422
- `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1432
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1423
1433
  );
1424
1434
  }
1425
1435
  let evaluatorIds;
@@ -1491,7 +1501,8 @@ var EffectRunner = class {
1491
1501
  globalEvaluationSemaphore: sem,
1492
1502
  runConfigName: job.runConfigName,
1493
1503
  runConfigTags: job.runConfigTags,
1494
- repetitions: job.repetitions
1504
+ repetitions: job.repetitions,
1505
+ experimentName: request.experimentName
1495
1506
  })
1496
1507
  );
1497
1508
  }
@@ -1526,7 +1537,8 @@ var EffectRunner = class {
1526
1537
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1527
1538
  repetitions: request.repetitions,
1528
1539
  runConfigName,
1529
- runConfigTags: request.runConfigTags
1540
+ runConfigTags: request.runConfigTags,
1541
+ experimentName: request.experimentName
1530
1542
  });
1531
1543
  }
1532
1544
  async startDatasetRun(params) {
@@ -1554,7 +1566,7 @@ var EffectRunner = class {
1554
1566
  const snapshot = {
1555
1567
  runId,
1556
1568
  datasetId: params.datasetId,
1557
- datasetName: dataset.dataset.getName(),
1569
+ datasetName: dataset.dataset.getDisplayLabel(),
1558
1570
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1559
1571
  queuedAt: Date.now(),
1560
1572
  totalTestCases: totalEvaluations,
@@ -1575,7 +1587,7 @@ var EffectRunner = class {
1575
1587
  type: "RunQueued",
1576
1588
  runId,
1577
1589
  datasetId: params.datasetId,
1578
- datasetName: dataset.dataset.getName(),
1590
+ datasetName: dataset.dataset.getDisplayLabel(),
1579
1591
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1580
1592
  totalTestCases: totalEvaluations,
1581
1593
  artifactPath
@@ -1601,7 +1613,8 @@ var EffectRunner = class {
1601
1613
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1602
1614
  runConfigName: params.runConfigName,
1603
1615
  runConfigTags,
1604
- repetitions
1616
+ repetitions,
1617
+ experimentName: params.experimentName
1605
1618
  })
1606
1619
  );
1607
1620
  return snapshot;
@@ -1721,6 +1734,17 @@ function parseSimpleCliArgs(argv) {
1721
1734
  index += 1;
1722
1735
  continue;
1723
1736
  }
1737
+ if (token === "--experiment" && argv[index + 1]) {
1738
+ const raw = argv[index + 1];
1739
+ if (typeof raw === "string") {
1740
+ const trimmed = raw.trim();
1741
+ if (trimmed.length > 0) {
1742
+ args.experimentName = trimmed;
1743
+ }
1744
+ }
1745
+ index += 1;
1746
+ continue;
1747
+ }
1724
1748
  args.unknownArgs.push(token);
1725
1749
  }
1726
1750
  return args;
@@ -1728,12 +1752,13 @@ function parseSimpleCliArgs(argv) {
1728
1752
  function getSimpleCliUsage() {
1729
1753
  return [
1730
1754
  "Usage:",
1731
- " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1732
- " eval-agents-simple generate --dataset <datasetName>",
1755
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
1756
+ " eval-agents-simple generate --dataset <datasetId>",
1733
1757
  "",
1734
1758
  "Options:",
1735
1759
  " --ci With run: exit with code 1 if any test case fails.",
1736
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1760
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1761
+ " --experiment <name> With run: set evaluator meta.experimentName for this invocation."
1737
1762
  ].join("\n");
1738
1763
  }
1739
1764
 
@@ -1797,7 +1822,7 @@ function GenerateView({
1797
1822
  if (!cancelled) {
1798
1823
  setResult({
1799
1824
  count: payload.length,
1800
- datasetName: dataset.dataset.getName(),
1825
+ datasetName: getDatasetDisplayLabel(dataset.dataset),
1801
1826
  outputPath
1802
1827
  });
1803
1828
  setTimeout(() => onComplete(), 200);
@@ -1858,7 +1883,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1858
1883
  const outputPath = createOutputPath(absoluteDatasetPath);
1859
1884
  await promises.writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
1860
1885
  `, "utf8");
1861
- console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
1886
+ console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
1862
1887
  console.log(`Wrote ${outputPath}`);
1863
1888
  }
1864
1889
  async function generateDatasetJsonCommandInk(runner, datasetName) {
@@ -2010,6 +2035,7 @@ function RunView({
2010
2035
  runner,
2011
2036
  runConfigNames,
2012
2037
  concurrency,
2038
+ experimentName,
2013
2039
  onComplete
2014
2040
  }) {
2015
2041
  const [phase, setPhase] = React.useState("loading");
@@ -2179,7 +2205,8 @@ function RunView({
2179
2205
  });
2180
2206
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2181
2207
  jobs,
2182
- globalConcurrency: concurrency
2208
+ globalConcurrency: concurrency,
2209
+ experimentName
2183
2210
  });
2184
2211
  for (let i = 0; i < snapshots.length; i += 1) {
2185
2212
  const snap = snapshots[i];
@@ -2236,7 +2263,7 @@ function RunView({
2236
2263
  setPhase("completed");
2237
2264
  const exitCode = failedTestCases > 0 ? 1 : 0;
2238
2265
  setTimeout(() => onComplete(void 0, exitCode), 200);
2239
- }, [runner, runConfigNames, concurrency, onComplete]);
2266
+ }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2240
2267
  React.useEffect(() => {
2241
2268
  void runEval();
2242
2269
  }, [runEval]);
@@ -2724,7 +2751,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2724
2751
  }
2725
2752
  return lines;
2726
2753
  }
2727
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2754
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2728
2755
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2729
2756
  if (jobs.length === 0) {
2730
2757
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2933,7 +2960,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
2933
2960
  console.log("");
2934
2961
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2935
2962
  jobs,
2936
- globalConcurrency: concurrency
2963
+ globalConcurrency: concurrency,
2964
+ experimentName
2937
2965
  });
2938
2966
  for (let i = 0; i < snapshots.length; i += 1) {
2939
2967
  const snap = snapshots[i];
@@ -3033,13 +3061,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
3033
3061
  }
3034
3062
  return failedTestCasesTotal > 0 ? 1 : 0;
3035
3063
  }
3036
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
3064
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3037
3065
  return new Promise((resolve5, reject) => {
3038
3066
  const app = ink.render(
3039
3067
  React__namespace.createElement(RunView, {
3040
3068
  runner,
3041
3069
  runConfigNames,
3042
3070
  concurrency,
3071
+ experimentName,
3043
3072
  onComplete: (err, exitCode) => {
3044
3073
  app.unmount();
3045
3074
  if (err) {
@@ -3100,7 +3129,8 @@ async function main() {
3100
3129
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3101
3130
  runner,
3102
3131
  args.runConfigNames,
3103
- concurrency
3132
+ concurrency,
3133
+ args.experimentName
3104
3134
  );
3105
3135
  if (args.ci && exitCode !== 0) {
3106
3136
  process.exit(1);
@@ -3109,7 +3139,7 @@ async function main() {
3109
3139
  }
3110
3140
  const genDataset = args.datasetName;
3111
3141
  if (!genDataset) {
3112
- console.error("Missing required --dataset <datasetName> argument.");
3142
+ console.error("Missing required --dataset <datasetId> argument.");
3113
3143
  printUsageAndExit(1);
3114
3144
  }
3115
3145
  await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(