npm - @m4trix/evals - Versions diffs - 0.25.1 → 0.27.0 - Mend

@m4trix/evals 0.25.1 → 0.27.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

package/dist/cli.js CHANGED

@@ -3,16 +3,16 @@ import { withFullScreen, useScreenSize } from 'fullscreen-ink';
 import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
 import { useApp, useInput, Box, Text } from 'ink';
 import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
-import { resolve, relative, join, dirname } from 'path';
-import { LineGraph } from '@pppp606/ink-chart';
+import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
 import { randomUUID } from 'crypto';
-import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
+import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
+import { resolve, join, relative, dirname } from 'path';
 import { existsSync } from 'fs';
 import * as jitiModule from 'jiti';
-import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
 import { pathToFileURL } from 'url';
 import { diffLines } from 'diff';
 import stringify from 'fast-json-stable-stringify';
+import { LineGraph } from '@pppp606/ink-chart';
 var SEP = "  ";
 var ARROW = "\u203A";
@@ -237,6 +237,59 @@ function isPrintableCharacter(input) {
 function isBackKey(key) {
   return key.backspace || key.delete;
 }
+var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
+function makeEntityIdSchema(brand, label) {
+  return Schema.String.pipe(
+    Schema.trimmed(),
+    Schema.minLength(1, {
+      message: () => `${label} must be non-empty.`
+    }),
+    Schema.pattern(ENTITY_ID_PATTERN, {
+      message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
+    }),
+    Schema.brand(brand)
+  );
+}
+var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
+makeEntityIdSchema("EvaluatorName", "Evaluator name");
+makeEntityIdSchema("TestCaseName", "Test case name");
+makeEntityIdSchema("DatasetName", "Dataset name");
+function validateWithSchema(schema, raw, context) {
+  const trimmed = raw.trim();
+  const decode = Schema.decodeUnknownEither(
+    schema
+  );
+  const result = decode(trimmed);
+  if (Either.isLeft(result)) {
+    throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
+  }
+  return result.right;
+}
+function validateRunConfigName(raw, context) {
+  return validateWithSchema(RunConfigNameSchema, raw, context);
+}
+// src/evals/dataset.ts
+function getDatasetDisplayLabel(dataset) {
+  if (typeof dataset.getDisplayLabel === "function") {
+    return dataset.getDisplayLabel();
+  }
+  return typeof dataset.getName === "function" ? dataset.getName() : "";
+}
+// src/evals/evaluator.ts
+function getEvaluatorDisplayLabel(evaluator) {
+  if (typeof evaluator.getDisplayLabel === "function") {
+    const label = evaluator.getDisplayLabel();
+    if (label !== void 0) {
+      return label;
+    }
+  }
+  return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
+}
+function getEvaluatorTagList(evaluator) {
+  return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
+}
 // src/cli/data.mock.json
 var data_mock_default = {
@@ -484,7 +537,7 @@ function toEvalDataset(item, snapshots) {
   const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
   return {
     id: item.id,
-    name: item.dataset.getName(),
+    name: getDatasetDisplayLabel(item.dataset),
     overview: `Discovered from ${item.filePath}`,
     runs
   };
@@ -492,7 +545,7 @@ function toEvalDataset(item, snapshots) {
 function toEvaluatorOption(item) {
   return {
     id: item.id,
-    name: item.evaluator.getName() ?? toSlug(item.id),
+    name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
     configPreview: `Source: ${item.filePath}`
   };
 }
@@ -735,6 +788,159 @@ function reduceCliState(state, action) {
   }
   return state;
 }
+async function loadRunSnapshotsFromArtifacts(config) {
+  const baseDir = resolve(config.artifactDirectory);
+  let entries;
+  try {
+    entries = await readdir(baseDir);
+  } catch {
+    return [];
+  }
+  const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
+  const snapshots = [];
+  for (const fileName of jsonlFiles) {
+    const filePath = join(baseDir, fileName);
+    try {
+      const snapshot = await parseArtifactToSnapshot(filePath, config);
+      if (snapshot) {
+        snapshots.push(snapshot);
+      }
+    } catch {
+    }
+  }
+  return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
+}
+async function parseArtifactToSnapshot(filePath, _config) {
+  const content = await readFile(filePath, "utf8");
+  const lines = content.split("\n").filter((line) => line.trim().length > 0);
+  if (lines.length === 0) {
+    return null;
+  }
+  let runQueued = null;
+  let runCompleted = null;
+  let runFailed = null;
+  let runStarted = null;
+  for (const line of lines) {
+    try {
+      const event = JSON.parse(line);
+      const type = event.type;
+      if (type === "RunQueued") {
+        runQueued = {
+          runId: event.runId,
+          datasetId: event.datasetId,
+          datasetName: event.datasetName,
+          evaluatorIds: event.evaluatorIds,
+          totalTestCases: event.totalTestCases ?? 0,
+          artifactPath: event.artifactPath ?? filePath,
+          ts: event.ts
+        };
+      }
+      if (type === "RunStarted") {
+        runStarted = { startedAt: event.startedAt };
+      }
+      if (type === "RunCompleted") {
+        runCompleted = {
+          passedTestCases: event.passedTestCases,
+          failedTestCases: event.failedTestCases,
+          totalTestCases: event.totalTestCases,
+          finishedAt: event.finishedAt
+        };
+      }
+      if (type === "RunFailed") {
+        runFailed = {
+          finishedAt: event.finishedAt,
+          errorMessage: event.errorMessage
+        };
+      }
+    } catch {
+    }
+  }
+  if (!runQueued) {
+    return null;
+  }
+  const artifactPath = filePath;
+  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
+  const progress = aggregateTestCaseProgress(lines);
+  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
+  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
+  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
+  return {
+    runId: runQueued.runId,
+    datasetId: runQueued.datasetId,
+    datasetName: runQueued.datasetName,
+    evaluatorIds: runQueued.evaluatorIds,
+    queuedAt: runQueued.ts ?? 0,
+    startedAt: runStarted?.startedAt,
+    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
+    totalTestCases: runQueued.totalTestCases,
+    completedTestCases,
+    passedTestCases,
+    failedTestCases,
+    status,
+    artifactPath,
+    errorMessage: runFailed?.errorMessage
+  };
+}
+function aggregateTestCaseProgress(lines) {
+  let completedTestCases = 0;
+  const testCasePassedBy = /* @__PURE__ */ new Map();
+  for (const line of lines) {
+    try {
+      const event = JSON.parse(line);
+      if (event.type === "TestCaseProgress") {
+        const ev = event;
+        completedTestCases = ev.completedTestCases ?? completedTestCases;
+        const id = ev.testCaseId;
+        const current = testCasePassedBy.get(id);
+        testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
+      }
+    } catch {
+    }
+  }
+  let passedTestCases = 0;
+  let failedTestCases = 0;
+  for (const passed of testCasePassedBy.values()) {
+    if (passed) {
+      passedTestCases += 1;
+    } else {
+      failedTestCases += 1;
+    }
+  }
+  return { completedTestCases, passedTestCases, failedTestCases };
+}
+async function parseArtifactFile(artifactPath) {
+  try {
+    const content = await readFile(artifactPath, "utf8");
+    const lines = content.split("\n").filter((line) => line.trim().length > 0);
+    const results = [];
+    for (const line of lines) {
+      try {
+        const event = JSON.parse(line);
+        if (event.type === "TestCaseProgress") {
+          const ev = event;
+          const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
+          const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
+          results.push({
+            testCaseId: ev.testCaseId,
+            testCaseName: ev.testCaseName,
+            completedTestCases: ev.completedTestCases,
+            totalTestCases: ev.totalTestCases,
+            repetitionId: ev.repetitionId,
+            repetitionIndex,
+            repetitionCount,
+            passed: ev.passed,
+            durationMs: ev.durationMs,
+            evaluatorScores: ev.evaluatorScores ?? []
+          });
+        }
+      } catch {
+      }
+    }
+    return results;
+  } catch {
+    return [];
+  }
+}
 // src/runner/config.ts
 var defaultRunnerConfig = {
@@ -742,6 +948,7 @@ var defaultRunnerConfig = {
     rootDir: process.cwd(),
     datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
     evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
+    runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
     testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
     excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
   },
@@ -767,6 +974,11 @@ function toRunnerConfigOverrides(config) {
   } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
     discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
   }
+  if (rawDiscovery?.runConfigFilePatterns !== void 0) {
+    discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
+  } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
+    discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
+  }
   if (rawDiscovery?.testCaseFilePatterns !== void 0) {
     discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
   } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -865,6 +1077,9 @@ function isDatasetLike(value) {
 function isEvaluatorLike(value) {
   return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
 }
+function isRunConfigLike(value) {
+  return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
+}
 function isTestCaseLike(value) {
   return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
 }
@@ -953,6 +1168,23 @@ async function collectEvaluatorsFromFiles(config) {
   );
   return found.flat();
 }
+async function collectRunConfigsFromFiles(config) {
+  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
+  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
+  const found = await Promise.all(
+    matched.map(async (absolutePath) => {
+      const exports = await loadModuleExports(absolutePath);
+      const runConfigs = exports.filter(isRunConfigLike);
+      const relPath = relative(config.rootDir, absolutePath);
+      return runConfigs.map((runConfig) => ({
+        id: runConfig.getName(),
+        filePath: relPath,
+        runConfig
+      }));
+    })
+  );
+  return found.flat();
+}
 async function collectTestCasesFromFiles(config) {
   const files = await walkDirectory(config.rootDir, config.excludeDirectories);
   const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1106,6 +1338,17 @@ function getDiffLines(entry) {
   });
 }
+// src/evals/test-case.ts
+function getTestCaseDisplayLabel(testCase) {
+  if (typeof testCase.getDisplayLabel === "function") {
+    return testCase.getDisplayLabel();
+  }
+  return typeof testCase.getName === "function" ? testCase.getName() : "";
+}
+function getTestCaseTagList(testCase) {
+  return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
+}
 // src/evals/metric.ts
 var registry = /* @__PURE__ */ new Map();
 var Metric = {
@@ -1129,6 +1372,54 @@ function getMetricById(id) {
   return registry.get(id);
 }
+// src/evals/aggregators.ts
+function aggregateTokenCountSum(values) {
+  const initial = {
+    input: 0,
+    output: 0,
+    inputCached: 0,
+    outputCached: 0
+  };
+  return values.reduce(
+    (acc, v) => ({
+      input: acc.input + (v.input ?? 0),
+      output: acc.output + (v.output ?? 0),
+      inputCached: acc.inputCached + (v.inputCached ?? 0),
+      outputCached: acc.outputCached + (v.outputCached ?? 0)
+    }),
+    initial
+  );
+}
+function aggregateLatencyAverage(values) {
+  if (values.length === 0) {
+    return { ms: 0 };
+  }
+  const sum = values.reduce((s, v) => s + v.ms, 0);
+  return { ms: sum / values.length };
+}
+// src/evals/metrics/standard.ts
+Metric.of({
+  id: "token-count",
+  name: "Tokens",
+  aggregate: aggregateTokenCountSum,
+  format: (data, options) => {
+    const input = data.input ?? 0;
+    const output = data.output ?? 0;
+    const inputCached = data.inputCached ?? 0;
+    const outputCached = data.outputCached ?? 0;
+    const cached = inputCached + outputCached;
+    const base = `in:${input} out:${output} cached:${cached}`;
+    return options?.isAggregated ? `Total: ${base}` : base;
+  }
+});
+Metric.of({
+  id: "latency",
+  name: "Latency",
+  aggregate: aggregateLatencyAverage,
+  format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
+});
 // src/evals/score.ts
 var registry2 = /* @__PURE__ */ new Map();
 function formatScoreData(def, data, options) {
@@ -1237,54 +1528,6 @@ function getScoreById(id) {
   return registry2.get(id);
 }
-// src/evals/aggregators.ts
-function aggregateTokenCountSum(values) {
-  const initial = {
-    input: 0,
-    output: 0,
-    inputCached: 0,
-    outputCached: 0
-  };
-  return values.reduce(
-    (acc, v) => ({
-      input: acc.input + (v.input ?? 0),
-      output: acc.output + (v.output ?? 0),
-      inputCached: acc.inputCached + (v.inputCached ?? 0),
-      outputCached: acc.outputCached + (v.outputCached ?? 0)
-    }),
-    initial
-  );
-}
-function aggregateLatencyAverage(values) {
-  if (values.length === 0) {
-    return { ms: 0 };
-  }
-  const sum = values.reduce((s, v) => s + v.ms, 0);
-  return { ms: sum / values.length };
-}
-// src/evals/metrics/standard.ts
-Metric.of({
-  id: "token-count",
-  name: "Tokens",
-  aggregate: aggregateTokenCountSum,
-  format: (data, options) => {
-    const input = data.input ?? 0;
-    const output = data.output ?? 0;
-    const inputCached = data.inputCached ?? 0;
-    const outputCached = data.outputCached ?? 0;
-    const cached = inputCached + outputCached;
-    const base = `in:${input} out:${output} cached:${cached}`;
-    return options?.isAggregated ? `Total: ${base}` : base;
-  }
-});
-Metric.of({
-  id: "latency",
-  name: "Latency",
-  aggregate: aggregateLatencyAverage,
-  format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
-});
 // src/evals/scores/standard.ts
 Score.of({
   id: "percent",
@@ -1391,15 +1634,17 @@ function readOutput(testCase) {
   }
   return candidate.getOutput();
 }
-function buildEvaluationUnits(testCases) {
+function buildEvaluationUnits(testCases, repetitionCount) {
+  const count = Math.max(1, repetitionCount);
   const units = [];
   for (const testCaseItem of testCases) {
-    const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
-    for (let r = 0; r < rerunTotal; r++) {
+    const repetitionId = `rep-${randomUUID()}`;
+    for (let r = 0; r < count; r++) {
       units.push({
         testCaseItem,
-        rerunIndex: r + 1,
-        rerunTotal
+        repetitionId,
+        repetitionIndex: r + 1,
+        repetitionCount: count
       });
     }
   }
@@ -1412,7 +1657,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
   return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
 }
 function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
-  const { testCaseItem, rerunIndex, rerunTotal } = unit;
+  const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
   return Effect.gen(function* () {
     const evaluatorRunId = `run-${randomUUID()}`;
     const started = Date.now();
@@ -1421,11 +1666,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
       type: "TestCaseStarted",
       runId: task.runId,
       testCaseId: testCaseItem.id,
-      testCaseName: testCaseItem.testCase.getName(),
+      testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
       startedTestCases: startedEvaluations,
       totalTestCases: totalEvaluations,
-      rerunIndex,
-      rerunTotal
+      repetitionId,
+      repetitionIndex,
+      repetitionCount
     });
     const evaluatorScores = [];
     let testCaseError;
@@ -1459,8 +1705,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
               meta: {
                 triggerId: task.triggerId,
                 runId: evaluatorRunId,
-                datasetId: task.datasetId
+                datasetName: task.dataset.getDisplayLabel(),
+                repetitionId,
+                repetitionIndex,
+                repetitionCount,
+                runConfigName: task.runConfigName
               },
+              testCaseTags: getTestCaseTagList(testCaseItem.testCase),
+              runConfigTags: task.runConfigTags,
+              evaluatorTags: getEvaluatorTagList(evaluator),
               logDiff,
               log,
               createError
@@ -1503,18 +1756,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
         });
       }
     }
-    const rerunPassedThis = evaluatorScores.every((s) => s.passed);
+    const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
     const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
     const progressEvent = {
       type: "TestCaseProgress",
       runId: task.runId,
       testCaseId: testCaseItem.id,
-      testCaseName: testCaseItem.testCase.getName(),
+      testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
       completedTestCases: completedEvaluations,
       totalTestCases: totalEvaluations,
-      rerunIndex,
-      rerunTotal,
-      passed: rerunPassedThis,
+      repetitionId,
+      repetitionIndex,
+      repetitionCount,
+      passed: repetitionPassedThis,
       durationMs: Date.now() - started,
       evaluatorScores,
       output,
@@ -1535,9 +1789,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
       (map) => {
         const key = testCaseItem.id;
         const existing = map.get(key) ?? { completedCount: 0, results: [] };
-        const newResults = [...existing.results, rerunPassedThis];
+        const newResults = [...existing.results, repetitionPassedThis];
         const newCompletedCount = existing.completedCount + 1;
-        const isLast = newCompletedCount === rerunTotal;
+        const isLast = newCompletedCount === repetitionCount;
         const newMap = new Map(map);
         newMap.set(key, {
           completedCount: newCompletedCount,
@@ -1574,10 +1828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
     runId: task.runId,
     startedAt
   });
-  const totalEvaluations = task.testCases.reduce(
-    (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
-    0
-  );
+  const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
   const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
   const completedRef = yield* Ref.make(0);
   const startedRef = yield* Ref.make(0);
@@ -1586,7 +1837,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
   const testCaseResultsRef = yield* Ref.make(
     /* @__PURE__ */ new Map()
   );
-  const evaluationUnits = buildEvaluationUnits(task.testCases);
+  const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
   const processEvaluation = (unit) => processOneEvaluation(
     task,
     unit,
@@ -1600,11 +1851,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
     failedRef,
     testCaseResultsRef
   );
-  yield* Effect.forEach(
-    evaluationUnits,
-    processEvaluation,
-    maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
-  );
+  const globalSem = task.globalEvaluationSemaphore;
+  if (globalSem !== void 0) {
+    yield* Effect.forEach(
+      evaluationUnits,
+      (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
+      { concurrency: "unbounded", discard: true }
+    );
+  } else {
+    yield* Effect.forEach(
+      evaluationUnits,
+      processEvaluation,
+      maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
+    );
+  }
   const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
     Ref.get(completedRef),
     Ref.get(passedRef),
@@ -1640,155 +1900,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
     artifactPath: task.snapshot.artifactPath
   });
 });
-async function loadRunSnapshotsFromArtifacts(config) {
-  const baseDir = resolve(config.artifactDirectory);
-  let entries;
-  try {
-    entries = await readdir(baseDir);
-  } catch {
-    return [];
-  }
-  const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
-  const snapshots = [];
-  for (const fileName of jsonlFiles) {
-    const filePath = join(baseDir, fileName);
-    try {
-      const snapshot = await parseArtifactToSnapshot(filePath, config);
-      if (snapshot) {
-        snapshots.push(snapshot);
-      }
-    } catch {
-    }
-  }
-  return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
-}
-async function parseArtifactToSnapshot(filePath, _config) {
-  const content = await readFile(filePath, "utf8");
-  const lines = content.split("\n").filter((line) => line.trim().length > 0);
-  if (lines.length === 0) {
-    return null;
-  }
-  let runQueued = null;
-  let runCompleted = null;
-  let runFailed = null;
-  let runStarted = null;
-  for (const line of lines) {
-    try {
-      const event = JSON.parse(line);
-      const type = event.type;
-      if (type === "RunQueued") {
-        runQueued = {
-          runId: event.runId,
-          datasetId: event.datasetId,
-          datasetName: event.datasetName,
-          evaluatorIds: event.evaluatorIds,
-          totalTestCases: event.totalTestCases ?? 0,
-          artifactPath: event.artifactPath ?? filePath,
-          ts: event.ts
-        };
-      }
-      if (type === "RunStarted") {
-        runStarted = { startedAt: event.startedAt };
-      }
-      if (type === "RunCompleted") {
-        runCompleted = {
-          passedTestCases: event.passedTestCases,
-          failedTestCases: event.failedTestCases,
-          totalTestCases: event.totalTestCases,
-          finishedAt: event.finishedAt
-        };
-      }
-      if (type === "RunFailed") {
-        runFailed = {
-          finishedAt: event.finishedAt,
-          errorMessage: event.errorMessage
-        };
-      }
-    } catch {
-    }
+// src/runner/name-pattern.ts
+function parseRegexLiteral(pattern) {
+  if (!pattern.startsWith("/")) {
+    return void 0;
   }
-  if (!runQueued) {
-    return null;
+  const lastSlash = pattern.lastIndexOf("/");
+  if (lastSlash <= 0) {
+    return void 0;
   }
-  const artifactPath = filePath;
-  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
-  const progress = aggregateTestCaseProgress(lines);
-  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
-  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
-  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
   return {
-    runId: runQueued.runId,
-    datasetId: runQueued.datasetId,
-    datasetName: runQueued.datasetName,
-    evaluatorIds: runQueued.evaluatorIds,
-    queuedAt: runQueued.ts ?? 0,
-    startedAt: runStarted?.startedAt,
-    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
-    totalTestCases: runQueued.totalTestCases,
-    completedTestCases,
-    passedTestCases,
-    failedTestCases,
-    status,
-    artifactPath,
-    errorMessage: runFailed?.errorMessage
+    source: pattern.slice(1, lastSlash),
+    flags: pattern.slice(lastSlash + 1)
   };
 }
-function aggregateTestCaseProgress(lines) {
-  let completedTestCases = 0;
-  const testCasePassedBy = /* @__PURE__ */ new Map();
-  for (const line of lines) {
-    try {
-      const event = JSON.parse(line);
-      if (event.type === "TestCaseProgress") {
-        const ev = event;
-        completedTestCases = ev.completedTestCases ?? completedTestCases;
-        const id = ev.testCaseId;
-        const current = testCasePassedBy.get(id);
-        testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
-      }
-    } catch {
-    }
-  }
-  let passedTestCases = 0;
-  let failedTestCases = 0;
-  for (const passed of testCasePassedBy.values()) {
-    if (passed) {
-      passedTestCases += 1;
-    } else {
-      failedTestCases += 1;
-    }
+function createNameMatcher(pattern) {
+  const normalizedPattern = pattern.trim();
+  const regexLiteral = parseRegexLiteral(normalizedPattern);
+  if (regexLiteral) {
+    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
+    return (value) => regex.test(value);
   }
-  return { completedTestCases, passedTestCases, failedTestCases };
-}
-async function parseArtifactFile(artifactPath) {
-  try {
-    const content = await readFile(artifactPath, "utf8");
-    const lines = content.split("\n").filter((line) => line.trim().length > 0);
-    const results = [];
-    for (const line of lines) {
-      try {
-        const event = JSON.parse(line);
-        if (event.type === "TestCaseProgress") {
-          const ev = event;
-          results.push({
-            testCaseId: ev.testCaseId,
-            testCaseName: ev.testCaseName,
-            completedTestCases: ev.completedTestCases,
-            totalTestCases: ev.totalTestCases,
-            rerunIndex: ev.rerunIndex,
-            rerunTotal: ev.rerunTotal,
-            passed: ev.passed,
-            durationMs: ev.durationMs,
-            evaluatorScores: ev.evaluatorScores ?? []
-          });
-        }
-      } catch {
-      }
-    }
-    return results;
-  } catch {
-    return [];
+  if (normalizedPattern.includes("*")) {
+    const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
+    const regex = new RegExp(`^${escaped}$`, "i");
+    return (value) => regex.test(value);
   }
+  return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
 }
 async function appendJsonLine(artifactPath, payload) {
   await mkdir(dirname(artifactPath), { recursive: true });
@@ -1847,32 +1986,12 @@ function searchCollectedTestCases(all, query) {
 }
 // src/runner/api.ts
-function parseRegexLiteral(pattern) {
-  if (!pattern.startsWith("/")) {
-    return void 0;
-  }
-  const lastSlash = pattern.lastIndexOf("/");
-  if (lastSlash <= 0) {
-    return void 0;
+function normalizeRunRepetitions(value) {
+  const n = value ?? 1;
+  if (!Number.isInteger(n) || n < 1) {
+    throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
   }
-  return {
-    source: pattern.slice(1, lastSlash),
-    flags: pattern.slice(lastSlash + 1)
-  };
-}
-function createNameMatcher(pattern) {
-  const normalizedPattern = pattern.trim();
-  const regexLiteral = parseRegexLiteral(normalizedPattern);
-  if (regexLiteral) {
-    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
-    return (value) => regex.test(value);
-  }
-  if (normalizedPattern.includes("*")) {
-    const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
-    const regex = new RegExp(`^${escaped}$`, "i");
-    return (value) => regex.test(value);
-  }
-  return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
+  return n;
 }
 function mergeRunnerOverrides(base, next) {
   if (!base) {
@@ -1907,6 +2026,7 @@ var EffectRunner = class {
     this.listeners = /* @__PURE__ */ new Set();
     this.datasetsById = /* @__PURE__ */ new Map();
     this.evaluatorsById = /* @__PURE__ */ new Map();
+    this.runConfigsById = /* @__PURE__ */ new Map();
     this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
     this.persistenceFiber = Effect.runFork(
       createPersistenceWorker(this.persistenceQueue)
@@ -1947,6 +2067,137 @@ var EffectRunner = class {
       (item) => matcher(item.evaluator.getName() ?? "")
     );
   }
+  async collectRunConfigs() {
+    const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
+    this.runConfigsById.clear();
+    const byNameLower = /* @__PURE__ */ new Map();
+    for (const item of runConfigs) {
+      const id = item.runConfig.getName();
+      const lower = id.toLowerCase();
+      const prev = byNameLower.get(lower);
+      if (prev !== void 0 && prev.filePath !== item.filePath) {
+        throw new Error(
+          `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
+        );
+      }
+      byNameLower.set(lower, item);
+      this.runConfigsById.set(id, item);
+    }
+    return runConfigs;
+  }
+  async resolveRunConfigByName(name) {
+    if (this.runConfigsById.size === 0) {
+      await this.collectRunConfigs();
+    }
+    const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
+    const keyLower = key.toLowerCase();
+    const matches = Array.from(this.runConfigsById.values()).filter(
+      (item) => item.runConfig.getName().toLowerCase() === keyLower
+    );
+    if (matches.length === 0) {
+      return void 0;
+    }
+    if (matches.length > 1) {
+      throw new Error(
+        `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
+      );
+    }
+    return matches[0];
+  }
+  async expandRunConfigToJobs(collected) {
+    if (this.datasetsById.size === 0) {
+      await this.collectDatasets();
+    }
+    if (this.evaluatorsById.size === 0) {
+      await this.collectEvaluators();
+    }
+    const rcName = collected.runConfig.getName();
+    const jobs = [];
+    const runs = collected.runConfig.getRuns();
+    for (const [i, row] of runs.entries()) {
+      const dsCollected = Array.from(this.datasetsById.values()).find(
+        (d) => d.dataset === row.dataset
+      );
+      if (!dsCollected) {
+        throw new Error(
+          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
+        );
+      }
+      let evaluatorIds;
+      if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
+        const matcher = createNameMatcher(row.evaluatorPattern);
+        const matched = Array.from(this.evaluatorsById.values()).filter(
+          (item) => matcher(item.evaluator.getName() ?? "")
+        );
+        if (matched.length === 0) {
+          throw new Error(
+            `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
+          );
+        }
+        evaluatorIds = matched.map((item) => item.id);
+      } else {
+        const evaluators = row.evaluators;
+        evaluatorIds = [];
+        for (const ev of evaluators) {
+          const found = Array.from(this.evaluatorsById.values()).find(
+            (item) => item.evaluator === ev
+          );
+          if (!found) {
+            throw new Error(
+              `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
+            );
+          }
+          evaluatorIds.push(found.id);
+        }
+      }
+      const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
+      jobs.push({
+        datasetId: dsCollected.id,
+        evaluatorIds,
+        runConfigName: rcName,
+        runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
+        runConfigTags: collected.runConfig.getTags(),
+        repetitions
+      });
+    }
+    return jobs;
+  }
+  async expandRunConfigNamesToJobs(names) {
+    const jobs = [];
+    for (const name of names) {
+      const collected = await this.resolveRunConfigByName(name);
+      if (!collected) {
+        const known = await this.collectRunConfigs();
+        const available = known.map((r) => r.runConfig.getName()).sort();
+        throw new Error(
+          available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
+        );
+      }
+      jobs.push(...await this.expandRunConfigToJobs(collected));
+    }
+    return jobs;
+  }
+  async runDatasetJobsWithSharedConcurrency(request) {
+    const globalConcurrency = Math.max(1, request.globalConcurrency);
+    const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
+    const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
+    const snapshots = [];
+    for (const job of request.jobs) {
+      snapshots.push(
+        await this.startDatasetRun({
+          datasetId: job.datasetId,
+          evaluatorIds: job.evaluatorIds,
+          triggerId,
+          maxConcurrency: this.config.maxConcurrency ?? 1,
+          globalEvaluationSemaphore: sem,
+          runConfigName: job.runConfigName,
+          runConfigTags: job.runConfigTags,
+          repetitions: job.repetitions
+        })
+      );
+    }
+    return snapshots;
+  }
   async searchTestCases(query) {
     const testCases = await collectTestCasesFromFiles(this.config.discovery);
     return searchCollectedTestCases(testCases, query);
@@ -1965,36 +2216,46 @@ var EffectRunner = class {
     );
   }
   async runDatasetWith(request) {
+    const runConfigName = validateRunConfigName(
+      request.runConfigName,
+      "runDatasetWith.runConfigName"
+    );
+    return this.startDatasetRun({
+      datasetId: request.datasetId,
+      evaluatorIds: request.evaluatorIds,
+      triggerId: request.triggerId,
+      maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
+      repetitions: request.repetitions,
+      runConfigName,
+      runConfigTags: request.runConfigTags
+    });
+  }
+  async startDatasetRun(params) {
     if (this.datasetsById.size === 0) {
       await this.collectDatasets();
     }
     if (this.evaluatorsById.size === 0) {
       await this.collectEvaluators();
     }
-    const dataset = this.datasetsById.get(request.datasetId);
+    const dataset = this.datasetsById.get(params.datasetId);
     if (!dataset) {
-      throw new Error(`Unknown dataset: ${request.datasetId}`);
+      throw new Error(`Unknown dataset: ${params.datasetId}`);
     }
-    const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
+    const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
     if (selectedEvaluators.length === 0) {
       throw new Error("No evaluators selected for run");
     }
-    const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
-    const totalEvaluations = selectedTestCases.reduce(
-      (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
-      0
-    );
-    const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
+    const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
+    const repetitions = normalizeRunRepetitions(params.repetitions);
+    const totalEvaluations = selectedTestCases.length * repetitions;
+    const runConfigTags = [...params.runConfigTags ?? []];
+    const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
     const runId = `run-${randomUUID()}`;
-    const artifactPath = createArtifactPath(
-      this.config.artifactDirectory,
-      request.datasetId,
-      runId
-    );
+    const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
     const snapshot = {
       runId,
-      datasetId: request.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetId: params.datasetId,
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       queuedAt: Date.now(),
       totalTestCases: totalEvaluations,
@@ -2014,8 +2275,8 @@ var EffectRunner = class {
     const queuedEvent = {
       type: "RunQueued",
       runId,
-      datasetId: request.datasetId,
-      datasetName: dataset.dataset.getName(),
+      datasetId: params.datasetId,
+      datasetName: dataset.dataset.getDisplayLabel(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       totalTestCases: totalEvaluations,
       artifactPath
@@ -2028,17 +2289,20 @@ var EffectRunner = class {
         payload: queuedEvent
       })
     );
-    const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
     await Effect.runPromise(
       Queue.offer(this.runQueue, {
         runId,
         triggerId,
-        datasetId: request.datasetId,
+        datasetId: params.datasetId,
         dataset: dataset.dataset,
         evaluators: selectedEvaluators,
         testCases: selectedTestCases,
         snapshot,
-        maxConcurrency
+        maxConcurrency: params.maxConcurrency,
+        globalEvaluationSemaphore: params.globalEvaluationSemaphore,
+        runConfigName: params.runConfigName,
+        runConfigTags,
+        repetitions
       })
     );
     return snapshot;
@@ -2109,6 +2373,11 @@ var EffectRunner = class {
     );
   }
 };
+// src/runner/events.ts
+var PROGRAMMATIC_RUN_CONFIG = {
+  runConfigName: "programmatic"
+};
 var LEFT_PANE_WIDTH2 = 44;
 var MAX_RUNS_FOR_CHART = 12;
 var MAX_RUNS_FOR_TREND = 20;
@@ -2456,7 +2725,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
     rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
     rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
     for (const tc of testCases) {
-      const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
+      const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
       rows.push(
         /* @__PURE__ */ jsxs(Text, { children: [
           /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -2468,13 +2737,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
           ] }),
           " ",
           tc.testCaseName,
-          rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
+          repetitionPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: repetitionPart }) : null,
           /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
             " (",
             tc.durationMs,
             "ms)"
           ] })
-        ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
+        ] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
       );
       for (const item of tc.evaluatorScores) {
         const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2800,7 +3069,8 @@ function EvalsCliApp({ data, args, runner }) {
       }
       void runner.runDatasetWith({
         datasetId: selectedDataset.id,
-        evaluatorIds: clampedState.selectedEvaluatorIds
+        evaluatorIds: clampedState.selectedEvaluatorIds,
+        ...PROGRAMMATIC_RUN_CONFIG
       }).then((snapshot) => {
         setRuntimeMessage(
           `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`