@m4trix/evals 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -80,6 +80,7 @@ export const myEvaluator = Evaluator.define({
80
80
  inputSchema,
81
81
  outputSchema: S.Unknown,
82
82
  scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
83
+ // optional: tags: ['suite-a'],
83
84
  }).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
84
85
  const start = Date.now();
85
86
  const value = 85;
@@ -132,13 +133,15 @@ export const myTestCase = TestCase.describe({
132
133
  });
133
134
  ```
134
135
 
136
+ `tags` is optional; omit it when the test case has no declared labels. Evaluators read the declared tags as `meta.testCaseTags`.
137
+
135
138
  ### 4) RunConfig (optional)
136
139
 
137
140
  Group several dataset/evaluator runs under one named config. Each row is either
138
141
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
139
142
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
140
143
 
141
- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`) and **`runConfigName`**: the **`RunConfig`** id (or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**). **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
144
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
142
145
 
143
146
  ```ts
144
147
  import { RunConfig } from '@m4trix/evals';
@@ -160,7 +163,7 @@ export const nightly = RunConfig.define({
160
163
  eval-agents-simple run --run-config "nightly"
161
164
  ```
162
165
 
163
- Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
166
+ Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap. Use **`--experiment <name>`** to set **`meta.experimentName`** for every evaluator in that CLI run (any non-empty string; surrounding whitespace is trimmed).
164
167
 
165
168
  ## CLI Commands
166
169
 
@@ -1017,11 +1017,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1017
1017
  repetitionId,
1018
1018
  repetitionIndex,
1019
1019
  repetitionCount,
1020
- runConfigName: task.runConfigName
1020
+ runConfigName: task.runConfigName,
1021
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1022
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1023
+ runConfigTags: task.runConfigTags,
1024
+ evaluatorTags: getEvaluatorTagList(evaluator)
1021
1025
  },
1022
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1023
- runConfigTags: task.runConfigTags,
1024
- evaluatorTags: getEvaluatorTagList(evaluator),
1025
1026
  logDiff,
1026
1027
  log,
1027
1028
  createError
@@ -1500,7 +1501,8 @@ var EffectRunner = class {
1500
1501
  globalEvaluationSemaphore: sem,
1501
1502
  runConfigName: job.runConfigName,
1502
1503
  runConfigTags: job.runConfigTags,
1503
- repetitions: job.repetitions
1504
+ repetitions: job.repetitions,
1505
+ experimentName: request.experimentName
1504
1506
  })
1505
1507
  );
1506
1508
  }
@@ -1535,7 +1537,8 @@ var EffectRunner = class {
1535
1537
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1536
1538
  repetitions: request.repetitions,
1537
1539
  runConfigName,
1538
- runConfigTags: request.runConfigTags
1540
+ runConfigTags: request.runConfigTags,
1541
+ experimentName: request.experimentName
1539
1542
  });
1540
1543
  }
1541
1544
  async startDatasetRun(params) {
@@ -1610,7 +1613,8 @@ var EffectRunner = class {
1610
1613
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1611
1614
  runConfigName: params.runConfigName,
1612
1615
  runConfigTags,
1613
- repetitions
1616
+ repetitions,
1617
+ experimentName: params.experimentName
1614
1618
  })
1615
1619
  );
1616
1620
  return snapshot;
@@ -1730,6 +1734,17 @@ function parseSimpleCliArgs(argv) {
1730
1734
  index += 1;
1731
1735
  continue;
1732
1736
  }
1737
+ if (token === "--experiment" && argv[index + 1]) {
1738
+ const raw = argv[index + 1];
1739
+ if (typeof raw === "string") {
1740
+ const trimmed = raw.trim();
1741
+ if (trimmed.length > 0) {
1742
+ args.experimentName = trimmed;
1743
+ }
1744
+ }
1745
+ index += 1;
1746
+ continue;
1747
+ }
1733
1748
  args.unknownArgs.push(token);
1734
1749
  }
1735
1750
  return args;
@@ -1737,12 +1752,13 @@ function parseSimpleCliArgs(argv) {
1737
1752
  function getSimpleCliUsage() {
1738
1753
  return [
1739
1754
  "Usage:",
1740
- " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1755
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
1741
1756
  " eval-agents-simple generate --dataset <datasetId>",
1742
1757
  "",
1743
1758
  "Options:",
1744
1759
  " --ci With run: exit with code 1 if any test case fails.",
1745
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1760
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1761
+ " --experiment <name> With run: set evaluator meta.experimentName for this invocation."
1746
1762
  ].join("\n");
1747
1763
  }
1748
1764
 
@@ -2019,6 +2035,7 @@ function RunView({
2019
2035
  runner,
2020
2036
  runConfigNames,
2021
2037
  concurrency,
2038
+ experimentName,
2022
2039
  onComplete
2023
2040
  }) {
2024
2041
  const [phase, setPhase] = React.useState("loading");
@@ -2188,7 +2205,8 @@ function RunView({
2188
2205
  });
2189
2206
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2190
2207
  jobs,
2191
- globalConcurrency: concurrency
2208
+ globalConcurrency: concurrency,
2209
+ experimentName
2192
2210
  });
2193
2211
  for (let i = 0; i < snapshots.length; i += 1) {
2194
2212
  const snap = snapshots[i];
@@ -2245,7 +2263,7 @@ function RunView({
2245
2263
  setPhase("completed");
2246
2264
  const exitCode = failedTestCases > 0 ? 1 : 0;
2247
2265
  setTimeout(() => onComplete(void 0, exitCode), 200);
2248
- }, [runner, runConfigNames, concurrency, onComplete]);
2266
+ }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2249
2267
  React.useEffect(() => {
2250
2268
  void runEval();
2251
2269
  }, [runEval]);
@@ -2733,7 +2751,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2733
2751
  }
2734
2752
  return lines;
2735
2753
  }
2736
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2754
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2737
2755
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2738
2756
  if (jobs.length === 0) {
2739
2757
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2942,7 +2960,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
2942
2960
  console.log("");
2943
2961
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2944
2962
  jobs,
2945
- globalConcurrency: concurrency
2963
+ globalConcurrency: concurrency,
2964
+ experimentName
2946
2965
  });
2947
2966
  for (let i = 0; i < snapshots.length; i += 1) {
2948
2967
  const snap = snapshots[i];
@@ -3042,13 +3061,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
3042
3061
  }
3043
3062
  return failedTestCasesTotal > 0 ? 1 : 0;
3044
3063
  }
3045
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
3064
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3046
3065
  return new Promise((resolve5, reject) => {
3047
3066
  const app = ink.render(
3048
3067
  React__namespace.createElement(RunView, {
3049
3068
  runner,
3050
3069
  runConfigNames,
3051
3070
  concurrency,
3071
+ experimentName,
3052
3072
  onComplete: (err, exitCode) => {
3053
3073
  app.unmount();
3054
3074
  if (err) {
@@ -3109,7 +3129,8 @@ async function main() {
3109
3129
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3110
3130
  runner,
3111
3131
  args.runConfigNames,
3112
- concurrency
3132
+ concurrency,
3133
+ args.experimentName
3113
3134
  );
3114
3135
  if (args.ci && exitCode !== 0) {
3115
3136
  process.exit(1);