@m4trix/evals 0.27.0 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -80,6 +80,7 @@ export const myEvaluator = Evaluator.define({
80
80
  inputSchema,
81
81
  outputSchema: S.Unknown,
82
82
  scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
83
+ // optional: tags: ['suite-a'],
83
84
  }).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
84
85
  const start = Date.now();
85
86
  const value = 85;
@@ -132,13 +133,15 @@ export const myTestCase = TestCase.describe({
132
133
  });
133
134
  ```
134
135
 
136
+ `tags` is optional; omit it when the test case has no declared labels. Evaluators read the declared tags as `meta.testCaseTags`.
137
+
135
138
  ### 4) RunConfig (optional)
136
139
 
137
140
  Group several dataset/evaluator runs under one named config. Each row is either
138
141
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
139
142
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
140
143
 
141
- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`) and **`runConfigName`**: the **`RunConfig`** id (or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**). **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
144
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
142
145
 
143
146
  ```ts
144
147
  import { RunConfig } from '@m4trix/evals';
@@ -160,7 +163,7 @@ export const nightly = RunConfig.define({
160
163
  eval-agents-simple run --run-config "nightly"
161
164
  ```
162
165
 
163
- Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap.
166
+ Repeat **`--run-config`** to queue several configs; jobs share one **`--concurrency`** cap. Use **`--experiment <name>`** to set **`meta.experimentName`** for every evaluator in that CLI run (accepts any non-empty string; leading/trailing whitespace is trimmed, and a whitespace-only value is ignored).
164
167
 
165
168
  ## CLI Commands
166
169
 
@@ -1014,14 +1014,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1014
1014
  triggerId: task.triggerId,
1015
1015
  runId: evaluatorRunId,
1016
1016
  datasetName: task.dataset.getDisplayLabel(),
1017
+ testCaseId: testCaseItem.id,
1018
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1017
1019
  repetitionId,
1018
1020
  repetitionIndex,
1019
1021
  repetitionCount,
1020
- runConfigName: task.runConfigName
1022
+ runConfigName: task.runConfigName,
1023
+ ...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
1024
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1025
+ runConfigTags: task.runConfigTags,
1026
+ evaluatorTags: getEvaluatorTagList(evaluator)
1021
1027
  },
1022
- testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1023
- runConfigTags: task.runConfigTags,
1024
- evaluatorTags: getEvaluatorTagList(evaluator),
1025
1028
  logDiff,
1026
1029
  log,
1027
1030
  createError
@@ -1500,7 +1503,8 @@ var EffectRunner = class {
1500
1503
  globalEvaluationSemaphore: sem,
1501
1504
  runConfigName: job.runConfigName,
1502
1505
  runConfigTags: job.runConfigTags,
1503
- repetitions: job.repetitions
1506
+ repetitions: job.repetitions,
1507
+ experimentName: request.experimentName
1504
1508
  })
1505
1509
  );
1506
1510
  }
@@ -1535,7 +1539,8 @@ var EffectRunner = class {
1535
1539
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1536
1540
  repetitions: request.repetitions,
1537
1541
  runConfigName,
1538
- runConfigTags: request.runConfigTags
1542
+ runConfigTags: request.runConfigTags,
1543
+ experimentName: request.experimentName
1539
1544
  });
1540
1545
  }
1541
1546
  async startDatasetRun(params) {
@@ -1610,7 +1615,8 @@ var EffectRunner = class {
1610
1615
  globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1611
1616
  runConfigName: params.runConfigName,
1612
1617
  runConfigTags,
1613
- repetitions
1618
+ repetitions,
1619
+ experimentName: params.experimentName
1614
1620
  })
1615
1621
  );
1616
1622
  return snapshot;
@@ -1730,6 +1736,17 @@ function parseSimpleCliArgs(argv) {
1730
1736
  index += 1;
1731
1737
  continue;
1732
1738
  }
1739
+ if (token === "--experiment" && argv[index + 1]) {
1740
+ const raw = argv[index + 1];
1741
+ if (typeof raw === "string") {
1742
+ const trimmed = raw.trim();
1743
+ if (trimmed.length > 0) {
1744
+ args.experimentName = trimmed;
1745
+ }
1746
+ }
1747
+ index += 1;
1748
+ continue;
1749
+ }
1733
1750
  args.unknownArgs.push(token);
1734
1751
  }
1735
1752
  return args;
@@ -1737,12 +1754,13 @@ function parseSimpleCliArgs(argv) {
1737
1754
  function getSimpleCliUsage() {
1738
1755
  return [
1739
1756
  "Usage:",
1740
- " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1757
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--experiment <name>] [--ci]",
1741
1758
  " eval-agents-simple generate --dataset <datasetId>",
1742
1759
  "",
1743
1760
  "Options:",
1744
1761
  " --ci With run: exit with code 1 if any test case fails.",
1745
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1762
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1763
+ " --experiment <name> With run: set evaluator meta.experimentName for this invocation."
1746
1764
  ].join("\n");
1747
1765
  }
1748
1766
 
@@ -2019,6 +2037,7 @@ function RunView({
2019
2037
  runner,
2020
2038
  runConfigNames,
2021
2039
  concurrency,
2040
+ experimentName,
2022
2041
  onComplete
2023
2042
  }) {
2024
2043
  const [phase, setPhase] = React.useState("loading");
@@ -2188,7 +2207,8 @@ function RunView({
2188
2207
  });
2189
2208
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2190
2209
  jobs,
2191
- globalConcurrency: concurrency
2210
+ globalConcurrency: concurrency,
2211
+ experimentName
2192
2212
  });
2193
2213
  for (let i = 0; i < snapshots.length; i += 1) {
2194
2214
  const snap = snapshots[i];
@@ -2245,7 +2265,7 @@ function RunView({
2245
2265
  setPhase("completed");
2246
2266
  const exitCode = failedTestCases > 0 ? 1 : 0;
2247
2267
  setTimeout(() => onComplete(void 0, exitCode), 200);
2248
- }, [runner, runConfigNames, concurrency, onComplete]);
2268
+ }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2249
2269
  React.useEffect(() => {
2250
2270
  void runEval();
2251
2271
  }, [runEval]);
@@ -2733,7 +2753,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2733
2753
  }
2734
2754
  return lines;
2735
2755
  }
2736
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2756
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2737
2757
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2738
2758
  if (jobs.length === 0) {
2739
2759
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2942,7 +2962,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
2942
2962
  console.log("");
2943
2963
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2944
2964
  jobs,
2945
- globalConcurrency: concurrency
2965
+ globalConcurrency: concurrency,
2966
+ experimentName
2946
2967
  });
2947
2968
  for (let i = 0; i < snapshots.length; i += 1) {
2948
2969
  const snap = snapshots[i];
@@ -3042,13 +3063,14 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency)
3042
3063
  }
3043
3064
  return failedTestCasesTotal > 0 ? 1 : 0;
3044
3065
  }
3045
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
3066
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3046
3067
  return new Promise((resolve5, reject) => {
3047
3068
  const app = ink.render(
3048
3069
  React__namespace.createElement(RunView, {
3049
3070
  runner,
3050
3071
  runConfigNames,
3051
3072
  concurrency,
3073
+ experimentName,
3052
3074
  onComplete: (err, exitCode) => {
3053
3075
  app.unmount();
3054
3076
  if (err) {
@@ -3109,7 +3131,8 @@ async function main() {
3109
3131
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3110
3132
  runner,
3111
3133
  args.runConfigNames,
3112
- concurrency
3134
+ concurrency,
3135
+ args.experimentName
3113
3136
  );
3114
3137
  if (args.ci && exitCode !== 0) {
3115
3138
  process.exit(1);