@m4trix/evals 0.26.0 → 0.27.0

package/README.md CHANGED
@@ -62,7 +62,8 @@ export default defineConfig((): ConfigType => ({
  import { Dataset } from '@m4trix/evals';

  export const myDataset = Dataset.define({
-   name: 'My Dataset',
+   name: 'my-dataset',
+   displayName: 'My Dataset',
    includedTags: ['demo'],
  });
  ```
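
For reference, a minimal sketch of how the new id/label split behaves at runtime, assuming the `getName()` / `getDisplayLabel()` accessors that appear later in this diff (`getDisplayLabel()` resolves `displayName ?? name`):

```ts
import { Dataset } from '@m4trix/evals';

const labeled = Dataset.define({
  name: 'my-dataset',        // stable id: letters, digits, _, - only
  displayName: 'My Dataset', // unrestricted UI label
  includedTags: ['demo'],
});

const plain = Dataset.define({
  name: 'other-dataset',     // no displayName: the id doubles as the label
  includedTags: ['demo'],
});

labeled.getDisplayLabel(); // 'My Dataset'
plain.getDisplayLabel();   // 'other-dataset' (displayName ?? name)
```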
@@ -137,7 +138,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.

- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`runConfigName`**: the **`RunConfig`** name (or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**). Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`) and **`runConfigName`**: the **`RunConfig`** id (or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**). **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.

  ```ts
  import { RunConfig } from '@m4trix/evals';
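
A sketch of an evaluator callback consuming the enriched `meta` described above. The field names are taken from this diff (`datasetName`, `runConfigName`, the `repetition*` trio, plus `triggerId` / `runId` from the dist hunks below); the callback shape itself is an assumption, not the package's published evaluator API:

```ts
type EvalMeta = {
  triggerId: string;
  runId: string;
  datasetName: string;   // Dataset.getDisplayLabel() → displayName ?? name
  runConfigName: string; // RunConfig id, or 'programmatic' for runDatasetWith
  repetitionId: string;  // shared by every execution in a repetition group
  repetitionIndex: number;
  repetitionCount: number;
};

const logRepetition = (meta: EvalMeta): void => {
  console.log(
    `${meta.runConfigName}/${meta.datasetName}: ` +
      `repetition ${meta.repetitionIndex + 1} of ${meta.repetitionCount}`,
  );
};
```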
@@ -165,7 +166,7 @@ Repeat **`--run-config`** to queue several configs; jobs share one **`--concurre

  - `eval-agents`: interactive CLI (starts runs with synthetic meta `programmatic` / `Programmatic`)
  - `eval-agents-simple run --run-config "<RunConfig name>"` (repeatable; case-insensitive match); add **`--ci`** to exit with code **1** if any test case fails
- - `eval-agents-simple generate --dataset "<dataset name>"`
+ - `eval-agents-simple generate --dataset "<dataset id>"` (canonical **`Dataset` `name`**, case-insensitive)

  ## Default Discovery and Artifacts

@@ -55,6 +55,7 @@ function makeEntityIdSchema(brand, label) {
  var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
  makeEntityIdSchema("EvaluatorName", "Evaluator name");
  makeEntityIdSchema("TestCaseName", "Test case name");
+ makeEntityIdSchema("DatasetName", "Dataset name");
  function validateWithSchema(schema, raw, context) {
    const trimmed = raw.trim();
    const decode = effect.Schema.decodeUnknownEither(
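
The new `DatasetName` schema slots into the same `makeEntityIdSchema` factory as the existing entity ids. A hypothetical reconstruction of that factory with Effect's `Schema` module, assuming the naming rule stated in the README (letters, digits, `_`, `-`; no spaces):

```ts
import { Schema, Either } from 'effect';

// Hypothetical reconstruction: one branded string schema per entity kind.
function makeEntityIdSchema(brand: string, label: string) {
  return Schema.String.pipe(
    Schema.pattern(/^[A-Za-z0-9_-]+$/),
    Schema.brand(brand),
  ).annotations({ identifier: label });
}

const DatasetNameSchema = makeEntityIdSchema('DatasetName', 'Dataset name');

Either.isRight(Schema.decodeUnknownEither(DatasetNameSchema)('my-dataset')); // true
Either.isRight(Schema.decodeUnknownEither(DatasetNameSchema)('my dataset')); // false (space)
```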
@@ -611,6 +612,14 @@ function getTestCaseTagList(testCase) {
    return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
  }

+ // src/evals/dataset.ts
+ function getDatasetDisplayLabel(dataset) {
+   if (typeof dataset.getDisplayLabel === "function") {
+     return dataset.getDisplayLabel();
+   }
+   return typeof dataset.getName === "function" ? dataset.getName() : "";
+ }
+
  // src/evals/metric.ts
  var registry = /* @__PURE__ */ new Map();
  var Metric = {
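
A usage sketch for the `getDatasetDisplayLabel` helper added above. It duck-types its argument, presumably so dataset-like objects that predate `getDisplayLabel()` still produce a usable label; the two sample objects below are hypothetical:

```ts
const modern = {
  getName: () => 'my-dataset',
  getDisplayLabel: () => 'My Dataset',
};
const legacy = { getName: () => 'my-dataset' };

getDatasetDisplayLabel(modern); // 'My Dataset' (prefers getDisplayLabel)
getDatasetDisplayLabel(legacy); // 'my-dataset' (falls back to getName)
```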
@@ -1004,7 +1013,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
  meta: {
    triggerId: task.triggerId,
    runId: evaluatorRunId,
-   datasetId: task.datasetId,
+   datasetName: task.dataset.getDisplayLabel(),
    repetitionId,
    repetitionIndex,
    repetitionCount,
@@ -1419,7 +1428,7 @@ var EffectRunner = class {
  );
  if (!dsCollected) {
    throw new Error(
-     `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
+     `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
    );
  }
  let evaluatorIds;
@@ -1554,7 +1563,7 @@ var EffectRunner = class {
  const snapshot = {
    runId,
    datasetId: params.datasetId,
-   datasetName: dataset.dataset.getName(),
+   datasetName: dataset.dataset.getDisplayLabel(),
    evaluatorIds: selectedEvaluators.map((item) => item.id),
    queuedAt: Date.now(),
    totalTestCases: totalEvaluations,
@@ -1575,7 +1584,7 @@ var EffectRunner = class {
    type: "RunQueued",
    runId,
    datasetId: params.datasetId,
-   datasetName: dataset.dataset.getName(),
+   datasetName: dataset.dataset.getDisplayLabel(),
    evaluatorIds: selectedEvaluators.map((item) => item.id),
    totalTestCases: totalEvaluations,
    artifactPath
@@ -1729,7 +1738,7 @@ function getSimpleCliUsage() {
  return [
    "Usage:",
    "  eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
-   "  eval-agents-simple generate --dataset <datasetName>",
+   "  eval-agents-simple generate --dataset <datasetId>",
    "",
    "Options:",
    "  --ci   With run: exit with code 1 if any test case fails.",
@@ -1797,7 +1806,7 @@ function GenerateView({
  if (!cancelled) {
    setResult({
      count: payload.length,
-     datasetName: dataset.dataset.getName(),
+     datasetName: getDatasetDisplayLabel(dataset.dataset),
      outputPath
    });
    setTimeout(() => onComplete(), 200);
@@ -1858,7 +1867,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
    const outputPath = createOutputPath(absoluteDatasetPath);
    await promises.writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
  `, "utf8");
-   console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
+   console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
    console.log(`Wrote ${outputPath}`);
  }
  async function generateDatasetJsonCommandInk(runner, datasetName) {
@@ -3109,7 +3118,7 @@ async function main() {
  }
  const genDataset = args.datasetName;
  if (!genDataset) {
-   console.error("Missing required --dataset <datasetName> argument.");
+   console.error("Missing required --dataset <datasetId> argument.");
    printUsageAndExit(1);
  }
  await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(