@m4trix/evals 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -80,6 +80,7 @@ export const myEvaluator = Evaluator.define({
80
80
  inputSchema,
81
81
  outputSchema: S.Unknown,
82
82
  scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
83
+ // optional: tags: ['suite-a'],
83
84
  }).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
84
85
  const start = Date.now();
85
86
  const value = 85;
@@ -132,13 +133,15 @@ export const myTestCase = TestCase.describe({
132
133
  });
133
134
  ```
134
135
 
136
+ `tags` is optional; omit it when the test case has no declared labels. Evaluators read the declared tags as `meta.testCaseTags`.
137
+
135
138
  ### 4) RunConfig (optional)
136
139
 
137
140
  Group several dataset/evaluator runs under one named config. Each row is either
138
141
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
139
142
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
140
143
 
141
- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`) and **`runConfigName`**: the **`RunConfig`** id (or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**). **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
144
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
142
145
 
143
146
  ```ts
144
147
  import { RunConfig } from '@m4trix/evals';
@@ -160,7 +163,7 @@ export const nightly = RunConfig.define({
160
163
  eval-agents-simple run --run-config "nightly"
161
164
  ```
162
165
 
163
- Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
166
+ Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap. Use **`--experiment <name>`** to set **`meta.experimentName`** for every evaluator in that CLI run (accepts any non-empty string; leading/trailing whitespace is trimmed, and a whitespace-only value is ignored).
164
167
 
165
168
  ## CLI Commands
166
169
 
@@ -1014,14 +1014,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1014
1014
  triggerId: task.triggerId,
1015
1015
  runId: evaluatorRunId,
1016
1016
  datasetName: task.dataset.getDisplayLabel(),
1017
+ testCaseId: testCaseItem.id,
1018
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1017
1019
  repetitionId,
1018
1020
  repetitionIndex,
1019
1021
  repetitionCount,
1020
- runConfigName: task.runConfigName
1022
+ runConfigName: task.runConfigName,
1023
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1024
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1025
+ runConfigTags: task.runConfigTags,
1026
+ evaluatorTags: getEvaluatorTagList(evaluator)
1021
1027
  },
1022
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1023
- runConfigTags: task.runConfigTags,
1024
- evaluatorTags: getEvaluatorTagList(evaluator),
1025
1028
  logDiff,
1026
1029
  log,
1027
1030
  createError
@@ -1500,7 +1503,8 @@ var EffectRunner = class {
1500
1503
  globalEvaluationSemaphore: sem,
1501
1504
  runConfigName: job.runConfigName,
1502
1505
  runConfigTags: job.runConfigTags,
1503
- repetitions: job.repetitions
1506
+ repetitions: job.repetitions,
1507
+ experimentName: request.experimentName
1504
1508
  })
1505
1509
  );
1506
1510
  }
@@ -1535,7 +1539,8 @@ var EffectRunner = class {
1535
1539
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1536
1540
  repetitions: request.repetitions,
1537
1541
  runConfigName,
1538
- runConfigTags: request.runConfigTags
1542
+ runConfigTags: request.runConfigTags,
1543
+ experimentName: request.experimentName
1539
1544
  });
1540
1545
  }
1541
1546
  async startDatasetRun(params) {
@@ -1610,7 +1615,8 @@ var EffectRunner = class {
1610
1615
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1611
1616
  runConfigName: params.runConfigName,
1612
1617
  runConfigTags,
1613
- repetitions
1618
+ repetitions,
1619
+ experimentName: params.experimentName
1614
1620
  })
1615
1621
  );
1616
1622
  return snapshot;
@@ -1730,6 +1736,17 @@ function parseSimpleCliArgs(argv) {
1730
1736
  index += 1;
1731
1737
  continue;
1732
1738
  }
1739
+ if (token === "--experiment" && argv[index + 1]) {
1740
+ const raw = argv[index + 1];
1741
+ if (typeof raw === "string") {
1742
+ const trimmed = raw.trim();
1743
+ if (trimmed.length > 0) {
1744
+ args.experimentName = trimmed;
1745
+ }
1746
+ }
1747
+ index += 1;
1748
+ continue;
1749
+ }
1733
1750
  args.unknownArgs.push(token);
1734
1751
  }
1735
1752
  return args;
@@ -1737,12 +1754,13 @@ function parseSimpleCliArgs(argv) {
1737
1754
  function getSimpleCliUsage() {
1738
1755
  return [
1739
1756
  "Usage:",
1740
- " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1757
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
1741
1758
  " eval-agents-simple generate --dataset <datasetId>",
1742
1759
  "",
1743
1760
  "Options:",
1744
1761
  " --ci With run: exit with code 1 if any test case fails.",
1745
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1762
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1763
+ " --experiment <name> With run: set evaluator meta.experimentName for this invocation."
1746
1764
  ].join("\n");
1747
1765
  }
1748
1766
 
@@ -2019,6 +2037,7 @@ function RunView({
2019
2037
  runner,
2020
2038
  runConfigNames,
2021
2039
  concurrency,
2040
+ experimentName,
2022
2041
  onComplete
2023
2042
  }) {
2024
2043
  const [phase, setPhase] = React.useState("loading");
@@ -2188,7 +2207,8 @@ function RunView({
2188
2207
  });
2189
2208
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2190
2209
  jobs,
2191
- globalConcurrency: concurrency
2210
+ globalConcurrency: concurrency,
2211
+ experimentName
2192
2212
  });
2193
2213
  for (let i = 0; i < snapshots.length; i += 1) {
2194
2214
  const snap = snapshots[i];
@@ -2245,7 +2265,7 @@ function RunView({
2245
2265
  setPhase("completed");
2246
2266
  const exitCode = failedTestCases > 0 ? 1 : 0;
2247
2267
  setTimeout(() => onComplete(void 0, exitCode), 200);
2248
- }, [runner, runConfigNames, concurrency, onComplete]);
2268
+ }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2249
2269
  React.useEffect(() => {
2250
2270
  void runEval();
2251
2271
  }, [runEval]);
@@ -2733,7 +2753,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2733
2753
  }
2734
2754
  return lines;
2735
2755
  }
2736
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2756
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2737
2757
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2738
2758
  if (jobs.length === 0) {
2739
2759
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2942,7 +2962,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
2942
2962
  console.log("");
2943
2963
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2944
2964
  jobs,
2945
- globalConcurrency: concurrency
2965
+ globalConcurrency: concurrency,
2966
+ experimentName
2946
2967
  });
2947
2968
  for (let i = 0; i < snapshots.length; i += 1) {
2948
2969
  const snap = snapshots[i];
@@ -3042,13 +3063,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
3042
3063
  }
3043
3064
  return failedTestCasesTotal > 0 ? 1 : 0;
3044
3065
  }
3045
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
3066
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3046
3067
  return new Promise((resolve5, reject) => {
3047
3068
  const app = ink.render(
3048
3069
  React__namespace.createElement(RunView, {
3049
3070
  runner,
3050
3071
  runConfigNames,
3051
3072
  concurrency,
3073
+ experimentName,
3052
3074
  onComplete: (err, exitCode) => {
3053
3075
  app.unmount();
3054
3076
  if (err) {
@@ -3109,7 +3131,8 @@ async function main() {
3109
3131
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3110
3132
  runner,
3111
3133
  args.runConfigNames,
3112
- concurrency
3134
+ concurrency,
3135
+ args.experimentName
3113
3136
  );
3114
3137
  if (args.ci && exitCode !== 0) {
3115
3138
  process.exit(1);