npm - @m4trix/evals - Versions diffs - 0.25.0 → 0.26.0 - Mend

@m4trix/evals 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/dist/index.cjs CHANGED Viewed

@@ -4,10 +4,10 @@ var effect = require('effect');
 var diff = require('diff');
 var stringify = require('fast-json-stable-stringify');
 var crypto = require('crypto');
-var fs = require('fs');
+var promises = require('fs/promises');
 var path = require('path');
+var fs = require('fs');
 var jitiModule = require('jiti');
-var promises = require('fs/promises');
 var url = require('url');
 var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
@@ -34,6 +34,164 @@ function _interopNamespace(e) {
 var stringify__default = /*#__PURE__*/_interopDefault(stringify);
 var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
+// src/index.ts
+var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
+function makeEntityIdSchema(brand, label) {
+  return effect.Schema.String.pipe(
+    effect.Schema.trimmed(),
+    effect.Schema.minLength(1, {
+      message: () => `${label} must be non-empty.`
+    }),
+    effect.Schema.pattern(ENTITY_ID_PATTERN, {
+      message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
+    }),
+    effect.Schema.brand(brand)
+  );
+}
+var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
+var EvaluatorNameSchema = makeEntityIdSchema("EvaluatorName", "Evaluator name");
+var TestCaseNameSchema = makeEntityIdSchema("TestCaseName", "Test case name");
+function validateWithSchema(schema, raw, context) {
+  const trimmed = raw.trim();
+  const decode = effect.Schema.decodeUnknownEither(
+    schema
+  );
+  const result = decode(trimmed);
+  if (effect.Either.isLeft(result)) {
+    throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
+  }
+  return result.right;
+}
+function validateRunConfigName(raw, context) {
+  return validateWithSchema(RunConfigNameSchema, raw, context);
+}
+function validateEvaluatorName(raw, context) {
+  return validateWithSchema(EvaluatorNameSchema, raw, context);
+}
+function validateTestCaseName(raw, context) {
+  return validateWithSchema(TestCaseNameSchema, raw, context);
+}
+function normalizeOptionalDisplayName(raw) {
+  if (raw === void 0) {
+    return void 0;
+  }
+  const t = raw.trim();
+  return t.length === 0 ? void 0 : t;
+}
+// src/evals/evaluator.ts
+var Evaluator = class _Evaluator {
+  constructor(config) {
+    this._config = config;
+  }
+  getState() {
+    return {
+      name: this._config.name,
+      displayName: this._config.displayName,
+      tags: this._config.tags,
+      inputSchema: this._config.inputSchema,
+      outputSchema: this._config.outputSchema,
+      scoreSchema: this._config.scoreSchema,
+      middlewares: this._config.middlewares,
+      evaluateFn: this._config.evaluateFn,
+      passThreshold: this._config.passThreshold,
+      passCriterion: this._config.passCriterion
+    };
+  }
+  static use(middleware) {
+    return new _Evaluator({
+      middlewares: [middleware],
+      tags: []
+    });
+  }
+  use(middleware) {
+    const state = this.getState();
+    return new _Evaluator({
+      ...state,
+      middlewares: [...state.middlewares, middleware]
+    });
+  }
+  define(config) {
+    const { middlewares } = this.getState();
+    const name = validateEvaluatorName(config.name, "Evaluator.define");
+    const displayName = normalizeOptionalDisplayName(config.displayName);
+    const tags = config.tags !== void 0 ? [...config.tags] : [];
+    return new _Evaluator({
+      name,
+      displayName,
+      tags,
+      inputSchema: config.inputSchema,
+      outputSchema: config.outputSchema,
+      scoreSchema: config.scoreSchema,
+      middlewares,
+      passThreshold: config.passThreshold,
+      passCriterion: config.passCriterion
+    });
+  }
+  evaluate(fn) {
+    return new _Evaluator({
+      ...this.getState(),
+      evaluateFn: fn
+    });
+  }
+  /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
+  getName() {
+    return this._config.name;
+  }
+  getDisplayName() {
+    return this._config.displayName;
+  }
+  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
+  getDisplayLabel() {
+    const id = this._config.name;
+    if (id === void 0) {
+      return void 0;
+    }
+    return this._config.displayName ?? id;
+  }
+  /** Tags from `Evaluator.define({ tags })`; empty until defined. */
+  getTags() {
+    return [...this._config.tags];
+  }
+  getInputSchema() {
+    return this._config.inputSchema;
+  }
+  getOutputSchema() {
+    return this._config.outputSchema;
+  }
+  getScoreSchema() {
+    return this._config.scoreSchema;
+  }
+  getMiddlewares() {
+    return this._config.middlewares;
+  }
+  getEvaluateFn() {
+    return this._config.evaluateFn;
+  }
+  getPassThreshold() {
+    return this._config.passThreshold;
+  }
+  getPassCriterion() {
+    return this._config.passCriterion;
+  }
+  async resolveContext() {
+    const parts = await Promise.all(this._config.middlewares.map((mw) => mw.resolve()));
+    return Object.assign({}, ...parts);
+  }
+};
+function getEvaluatorDisplayLabel(evaluator) {
+  if (typeof evaluator.getDisplayLabel === "function") {
+    const label = evaluator.getDisplayLabel();
+    if (label !== void 0) {
+      return label;
+    }
+  }
+  return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
+}
+function getEvaluatorTagList(evaluator) {
+  return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
+}
 // src/cli/data.mock.json
 var data_mock_default = {
   datasets: [
@@ -184,9 +342,7 @@ var data_mock_default = {
             { name: "contract_match", score: 100 },
             { name: "arg_validity", score: 100 }
           ],
-          checks: [
-            { name: "tool_calls", passed: true, detail: "0 unexpected" }
-          ],
+          checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
           failures: [],
           meta: {
             model: "gpt-4o-mini",
@@ -209,9 +365,21 @@ var data_mock_default = {
     }
   ],
   evaluators: [
-    { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
-    { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
-    { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
+    {
+      id: "json-schema-validator",
+      name: "JSON Schema Validator",
+      configPreview: "strict=true"
+    },
+    {
+      id: "tool-call-contract-checker",
+      name: "Tool-call Contract Checker",
+      configPreview: "unexpectedCalls=error"
+    },
+    {
+      id: "rubric-judge",
+      name: "Rubric Judge (LLM)",
+      configPreview: "model=gpt-4o-mini; scale=0-100"
+    },
     { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
   ]
 };
@@ -278,7 +446,7 @@ function toEvalDataset(item, snapshots) {
 function toEvaluatorOption(item) {
   return {
     id: item.id,
-    name: item.evaluator.getName() ?? toSlug(item.id),
+    name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
     configPreview: `Source: ${item.filePath}`
   };
 }
@@ -291,9 +459,7 @@ async function loadRunnerData(runner) {
   const memSnapshots = runner.getAllRunSnapshots();
   const seen = new Set(memSnapshots.map((s) => s.runId));
   const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
-  const snapshots = [...memSnapshots, ...fromDisk].sort(
-    (a, b) => b.queuedAt - a.queuedAt
-  );
+  const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
   if (datasets.length === 0 && evaluators.length === 0) {
     return loadMockData();
   }
@@ -326,134 +492,6 @@ function parseStartupArgs(argv) {
   return args;
 }
-// src/evals/test-case.ts
-function resolve(value) {
-  return typeof value === "function" ? value() : value;
-}
-var TestCase = class _TestCase {
-  constructor(config) {
-    this._config = config;
-  }
-  static describe(config) {
-    const reruns = config.reruns ?? 1;
-    if (reruns < 1 || !Number.isInteger(reruns)) {
-      throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
-    }
-    return new _TestCase({
-      name: config.name,
-      tags: config.tags,
-      reruns,
-      inputSchema: config.inputSchema,
-      input: config.input,
-      outputSchema: config.outputSchema,
-      output: config.output
-    });
-  }
-  getReruns() {
-    return this._config.reruns;
-  }
-  getName() {
-    return this._config.name;
-  }
-  getTags() {
-    return this._config.tags;
-  }
-  getInputSchema() {
-    return this._config.inputSchema;
-  }
-  getInput() {
-    return resolve(this._config.input);
-  }
-  getOutputSchema() {
-    return this._config.outputSchema;
-  }
-  getOutput() {
-    if (this._config.output === void 0) {
-      return void 0;
-    }
-    return resolve(this._config.output);
-  }
-};
-// src/evals/evaluator.ts
-var Evaluator = class _Evaluator {
-  constructor(config) {
-    this._config = config;
-  }
-  getState() {
-    return {
-      name: this._config.name,
-      inputSchema: this._config.inputSchema,
-      outputSchema: this._config.outputSchema,
-      scoreSchema: this._config.scoreSchema,
-      middlewares: this._config.middlewares,
-      evaluateFn: this._config.evaluateFn,
-      passThreshold: this._config.passThreshold,
-      passCriterion: this._config.passCriterion
-    };
-  }
-  static use(middleware) {
-    return new _Evaluator({
-      middlewares: [middleware]
-    });
-  }
-  use(middleware) {
-    const state = this.getState();
-    return new _Evaluator({
-      ...state,
-      middlewares: [...state.middlewares, middleware]
-    });
-  }
-  define(config) {
-    const { middlewares } = this.getState();
-    return new _Evaluator({
-      name: config.name,
-      inputSchema: config.inputSchema,
-      outputSchema: config.outputSchema,
-      scoreSchema: config.scoreSchema,
-      middlewares,
-      passThreshold: config.passThreshold,
-      passCriterion: config.passCriterion
-    });
-  }
-  evaluate(fn) {
-    return new _Evaluator({
-      ...this.getState(),
-      evaluateFn: fn
-    });
-  }
-  getName() {
-    return this._config.name;
-  }
-  getInputSchema() {
-    return this._config.inputSchema;
-  }
-  getOutputSchema() {
-    return this._config.outputSchema;
-  }
-  getScoreSchema() {
-    return this._config.scoreSchema;
-  }
-  getMiddlewares() {
-    return this._config.middlewares;
-  }
-  getEvaluateFn() {
-    return this._config.evaluateFn;
-  }
-  getPassThreshold() {
-    return this._config.passThreshold;
-  }
-  getPassCriterion() {
-    return this._config.passCriterion;
-  }
-  async resolveContext() {
-    const parts = await Promise.all(
-      this._config.middlewares.map((mw) => mw.resolve())
-    );
-    return Object.assign({}, ...parts);
-  }
-};
 // src/evals/dataset.ts
 function matchesAny(value, matchers) {
   return matchers.some(
@@ -517,230 +555,13 @@ var Dataset = class _Dataset {
     return tagMatch && pathMatch;
   }
 };
-// src/evals/metric.ts
-var registry = /* @__PURE__ */ new Map();
-var Metric = {
-  of(config) {
-    const def = {
-      id: config.id,
-      name: config.name,
-      aggregate: config.aggregate,
-      format: config.format,
-      make: (data, options) => ({
-        id: config.id,
-        data,
-        ...options?.name !== void 0 && { name: options.name }
-      })
-    };
-    registry.set(config.id, def);
-    return def;
-  }
-};
-function getMetricById(id) {
-  return registry.get(id);
-}
-// src/evals/score.ts
-var registry2 = /* @__PURE__ */ new Map();
-function formatScoreData(def, data, options) {
-  return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
-}
-var ScoreAggregate = {
-  /** Average numeric fields. Use for scores like { value, delta }. */
-  averageFields(fields) {
-    return (values) => {
-      const count = values.length || 1;
-      const result = {};
-      for (const field of fields) {
-        result[field] = values.reduce(
-          (s, v) => s + (v[field] ?? 0),
-          0
-        ) / count;
-      }
-      return result;
-    };
-  },
-  /** Average selected numeric fields, with sample std dev tracked for `value`. */
-  averageWithVariance(fields) {
-    return (values) => {
-      const count = values.length;
-      const result = {};
-      for (const field of fields) {
-        result[field] = count === 0 ? 0 : values.reduce(
-          (sum, item) => sum + (item[field] ?? 0),
-          0
-        ) / count;
-      }
-      const valueField = "value";
-      const hasValueField = fields.includes(valueField);
-      if (count === 0) {
-        if (hasValueField) {
-          result[valueField] = 0;
-        }
-        return {
-          ...result,
-          stdDev: void 0,
-          count: 0
-        };
-      }
-      let stdDev;
-      if (hasValueField && count >= 2) {
-        const sum = values.reduce(
-          (s, v) => s + (v[valueField] ?? 0),
-          0
-        );
-        const sumSq = values.reduce(
-          (s, v) => {
-            const value = v[valueField] ?? 0;
-            return s + value * value;
-          },
-          0
-        );
-        const mean = sum / count;
-        const variance = (sumSq - count * mean * mean) / (count - 1);
-        stdDev = variance > 0 ? Math.sqrt(variance) : 0;
-      }
-      return {
-        ...values[0],
-        ...result,
-        stdDev,
-        count
-      };
-    };
-  },
-  /** All runs must pass. Use for binary scores. */
-  all(values) {
-    const total = values.length;
-    const passedCount = values.filter((v) => v.passed).length;
-    return {
-      ...values[0],
-      passed: total > 0 && values.every((v) => v.passed),
-      passedCount,
-      totalCount: total
-    };
-  },
-  /** Take last value (no aggregation). Use when aggregation is not meaningful. */
-  last(values) {
-    return values[values.length - 1] ?? {};
-  }
-};
-var Score = {
-  aggregate: ScoreAggregate,
-  of(config) {
-    const def = {
-      id: config.id,
-      name: config.name,
-      displayStrategy: config.displayStrategy,
-      formatValue: config.formatValue,
-      formatAggregate: config.formatAggregate,
-      aggregateValues: config.aggregateValues,
-      make: (data, options) => {
-        const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
-        return {
-          id: config.id,
-          data,
-          ...passed !== void 0 && { passed },
-          ...options?.name !== void 0 && { name: options.name },
-          def
-          // Attach def so rendering/aggregation works without registry lookup
-        };
-      }
-    };
-    registry2.set(config.id, def);
-    return def;
-  }
-};
-function getScoreById(id) {
-  return registry2.get(id);
-}
-// src/evals/aggregators.ts
-function aggregateTokenCountSum(values) {
-  const initial = {
-    input: 0,
-    output: 0,
-    inputCached: 0,
-    outputCached: 0
-  };
-  return values.reduce(
-    (acc, v) => ({
-      input: acc.input + (v.input ?? 0),
-      output: acc.output + (v.output ?? 0),
-      inputCached: acc.inputCached + (v.inputCached ?? 0),
-      outputCached: acc.outputCached + (v.outputCached ?? 0)
-    }),
-    initial
-  );
-}
-function aggregateLatencyAverage(values) {
-  if (values.length === 0) {
-    return { ms: 0 };
-  }
-  const sum = values.reduce((s, v) => s + v.ms, 0);
-  return { ms: sum / values.length };
-}
-// src/evals/metrics/standard.ts
-var tokenCountMetric = Metric.of({
-  id: "token-count",
-  name: "Tokens",
-  aggregate: aggregateTokenCountSum,
-  format: (data, options) => {
-    const input = data.input ?? 0;
-    const output = data.output ?? 0;
-    const inputCached = data.inputCached ?? 0;
-    const outputCached = data.outputCached ?? 0;
-    const cached = inputCached + outputCached;
-    const base = `in:${input} out:${output} cached:${cached}`;
-    return options?.isAggregated ? `Total: ${base}` : base;
-  }
-});
-var latencyMetric = Metric.of({
-  id: "latency",
-  name: "Latency",
-  aggregate: aggregateLatencyAverage,
-  format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
-});
-// src/evals/scores/standard.ts
-var percentScore = Score.of({
-  id: "percent",
-  name: "Score",
-  displayStrategy: "bar",
-  formatValue: (data) => data.value.toFixed(2),
-  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
-  aggregateValues: Score.aggregate.averageWithVariance(["value"])
-});
-var deltaScore = Score.of({
-  id: "delta",
-  name: "Delta",
-  displayStrategy: "number",
-  formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
-  formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
-  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
-});
-var binaryScore = Score.of({
-  id: "binary",
-  name: "Result",
-  displayStrategy: "passFail",
-  formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
-  formatAggregate: (data) => {
-    const base = data.passed ? "All: PASSED" : "Some: FAILED";
-    if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
-      return `${base} (${data.passedCount}/${data.totalCount})`;
-    }
-    return base;
-  },
-  aggregateValues: Score.aggregate.all
-});
-function preprocessForDiff(value, options) {
-  if (options?.sort && Array.isArray(value)) {
-    return [...value].sort((a, b) => {
-      const aStr = stringify__default.default(preprocessForDiff(a, options));
-      const bStr = stringify__default.default(preprocessForDiff(b, options));
-      return aStr.localeCompare(bStr);
-    }).map((item) => preprocessForDiff(item, options));
+function preprocessForDiff(value, options) {
+  if (options?.sort && Array.isArray(value)) {
+    return [...value].sort((a, b) => {
+      const aStr = stringify__default.default(preprocessForDiff(a, options));
+      const bStr = stringify__default.default(preprocessForDiff(b, options));
+      return aStr.localeCompare(bStr);
+    }).map((item) => preprocessForDiff(item, options));
   }
   if (value !== null && typeof value === "object" && !Array.isArray(value) && options?.excludeKeys) {
     const keys = Array.isArray(options.excludeKeys) ? options.excludeKeys : options.excludeKeys.split(",").map((k) => k.trim());
@@ -791,16 +612,8 @@ function createDiffString(expected, actual, diffOptions) {
   const expectedProcessed = preprocessForDiff(expected, diffOptions);
   const actualProcessed = preprocessForDiff(actual, diffOptions);
   if (diffOptions?.keysOnly) {
-    const expectedKeys = JSON.stringify(
-      extractKeys(expectedProcessed),
-      null,
-      2
-    );
-    const actualKeys = JSON.stringify(
-      extractKeys(actualProcessed),
-      null,
-      2
-    );
+    const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
+    const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
     const parts2 = diff.diffLines(expectedKeys, actualKeys);
     return formatDiffParts(parts2);
   }
@@ -811,9 +624,7 @@ function createDiffString(expected, actual, diffOptions) {
   }
   const parts = diff.diffLines(expectedStr, actualStr);
   if (diffOptions?.outputNewOnly) {
-    const filtered = parts.filter(
-      (p) => p.added === true
-    );
+    const filtered = parts.filter((p) => p.added === true);
     return formatDiffParts(filtered);
   }
   return formatDiffParts(parts);
@@ -878,14 +689,476 @@ function printJsonDiff(expected, actual, options = {}) {
       if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
         return `\x1B[32m${line}\x1B[0m`;
       }
-      return line;
-    });
-    const colored = lines.join("\n");
-    console.log(colored || "(no differences)");
-    return colored;
+      return line;
+    });
+    const colored = lines.join("\n");
+    console.log(colored || "(no differences)");
+    return colored;
+  }
+  console.log(diff || "(no differences)");
+  return diff;
+}
+// src/evals/metric.ts
+var registry = /* @__PURE__ */ new Map();
+var Metric = {
+  of(config) {
+    const def = {
+      id: config.id,
+      name: config.name,
+      aggregate: config.aggregate,
+      format: config.format,
+      make: (data, options) => ({
+        id: config.id,
+        data,
+        ...options?.name !== void 0 && { name: options.name }
+      })
+    };
+    registry.set(config.id, def);
+    return def;
+  }
+};
+function getMetricById(id) {
+  return registry.get(id);
+}
+// src/evals/aggregators.ts
+function aggregateTokenCountSum(values) {
+  const initial = {
+    input: 0,
+    output: 0,
+    inputCached: 0,
+    outputCached: 0
+  };
+  return values.reduce(
+    (acc, v) => ({
+      input: acc.input + (v.input ?? 0),
+      output: acc.output + (v.output ?? 0),
+      inputCached: acc.inputCached + (v.inputCached ?? 0),
+      outputCached: acc.outputCached + (v.outputCached ?? 0)
+    }),
+    initial
+  );
+}
+function aggregateLatencyAverage(values) {
+  if (values.length === 0) {
+    return { ms: 0 };
+  }
+  const sum = values.reduce((s, v) => s + v.ms, 0);
+  return { ms: sum / values.length };
+}
+// src/evals/metrics/standard.ts
+var tokenCountMetric = Metric.of({
+  id: "token-count",
+  name: "Tokens",
+  aggregate: aggregateTokenCountSum,
+  format: (data, options) => {
+    const input = data.input ?? 0;
+    const output = data.output ?? 0;
+    const inputCached = data.inputCached ?? 0;
+    const outputCached = data.outputCached ?? 0;
+    const cached = inputCached + outputCached;
+    const base = `in:${input} out:${output} cached:${cached}`;
+    return options?.isAggregated ? `Total: ${base}` : base;
+  }
+});
+var latencyMetric = Metric.of({
+  id: "latency",
+  name: "Latency",
+  aggregate: aggregateLatencyAverage,
+  format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
+});
+// src/evals/run-config.ts
+function validateRow(row, index) {
+  const hasEvaluators = "evaluators" in row && row.evaluators !== void 0 && row.evaluators !== void 0;
+  const hasPattern = "evaluatorPattern" in row && typeof row.evaluatorPattern === "string" && row.evaluatorPattern.trim().length > 0;
+  if (hasEvaluators && hasPattern) {
+    throw new Error(`RunConfig run[${index}] must not set both evaluators and evaluatorPattern`);
+  }
+  if (!hasEvaluators && !hasPattern) {
+    throw new Error(`RunConfig run[${index}] must set either evaluators or evaluatorPattern`);
+  }
+  if (hasEvaluators && row.evaluators.length === 0) {
+    throw new Error(`RunConfig run[${index}]: evaluators must be non-empty`);
+  }
+  const rawRep = "repetitions" in row ? row.repetitions : void 0;
+  const repetitions = rawRep ?? 1;
+  if (!Number.isInteger(repetitions) || repetitions < 1) {
+    throw new Error(
+      `RunConfig run[${index}]: repetitions must be a positive integer, got ${String(rawRep)}`
+    );
+  }
+}
+var RunConfig = class _RunConfig {
+  constructor(name, displayName, tags, runs) {
+    this._name = name;
+    this._displayName = displayName;
+    this._tags = tags;
+    this._runs = runs;
+  }
+  static define(config) {
+    if (config.runs.length === 0) {
+      throw new Error("RunConfig runs must be non-empty");
+    }
+    config.runs.forEach(validateRow);
+    const name = validateRunConfigName(config.name, "RunConfig.define");
+    const displayName = normalizeOptionalDisplayName(config.displayName);
+    const tags = config.tags !== void 0 ? [...config.tags] : [];
+    return new _RunConfig(name, displayName, tags, config.runs);
+  }
+  /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
+  getName() {
+    return this._name;
+  }
+  /** Optional unrestricted display label. */
+  getDisplayName() {
+    return this._displayName;
+  }
+  /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
+  getDisplayLabel() {
+    return this._displayName ?? this._name;
+  }
+  /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
+  getTags() {
+    return [...this._tags];
+  }
+  getRuns() {
+    return this._runs;
+  }
+};
+// src/evals/score.ts
+var registry2 = /* @__PURE__ */ new Map();
+function formatScoreData(def, data, options) {
+  return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
+}
+var ScoreAggregate = {
+  /** Average numeric fields. Use for scores like { value, delta }. */
+  averageFields(fields) {
+    return (values) => {
+      const count = values.length || 1;
+      const result = {};
+      for (const field of fields) {
+        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
+      }
+      return result;
+    };
+  },
+  /** Average selected numeric fields, with sample std dev tracked for `value`. */
+  averageWithVariance(fields) {
+    return (values) => {
+      const count = values.length;
+      const result = {};
+      for (const field of fields) {
+        result[field] = count === 0 ? 0 : values.reduce(
+          (sum, item) => sum + (item[field] ?? 0),
+          0
+        ) / count;
+      }
+      const valueField = "value";
+      const hasValueField = fields.includes(valueField);
+      if (count === 0) {
+        if (hasValueField) {
+          result[valueField] = 0;
+        }
+        return {
+          ...result,
+          stdDev: void 0,
+          count: 0
+        };
+      }
+      let stdDev;
+      if (hasValueField && count >= 2) {
+        const sum = values.reduce(
+          (s, v) => s + (v[valueField] ?? 0),
+          0
+        );
+        const sumSq = values.reduce((s, v) => {
+          const value = v[valueField] ?? 0;
+          return s + value * value;
+        }, 0);
+        const mean = sum / count;
+        const variance = (sumSq - count * mean * mean) / (count - 1);
+        stdDev = variance > 0 ? Math.sqrt(variance) : 0;
+      }
+      return {
+        ...values[0],
+        ...result,
+        stdDev,
+        count
+      };
+    };
+  },
+  /** All runs must pass. Use for binary scores. */
+  all(values) {
+    const total = values.length;
+    const passedCount = values.filter((v) => v.passed).length;
+    return {
+      ...values[0],
+      passed: total > 0 && values.every((v) => v.passed),
+      passedCount,
+      totalCount: total
+    };
+  },
+  /** Take last value (no aggregation). Use when aggregation is not meaningful. */
+  last(values) {
+    return values[values.length - 1] ?? {};
+  }
+};
+var Score = {
+  aggregate: ScoreAggregate,
+  of(config) {
+    const def = {
+      id: config.id,
+      name: config.name,
+      displayStrategy: config.displayStrategy,
+      formatValue: config.formatValue,
+      formatAggregate: config.formatAggregate,
+      aggregateValues: config.aggregateValues,
+      make: (data, options) => {
+        const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
+        return {
+          id: config.id,
+          data,
+          ...passed !== void 0 && { passed },
+          ...options?.name !== void 0 && { name: options.name },
+          def
+          // Attach def so rendering/aggregation works without registry lookup
+        };
+      }
+    };
+    registry2.set(config.id, def);
+    return def;
+  }
+};
+function getScoreById(id) {
+  return registry2.get(id);
+}
+// src/evals/scores/standard.ts
+var percentScore = Score.of({
+  id: "percent",
+  name: "Score",
+  displayStrategy: "bar",
+  formatValue: (data) => data.value.toFixed(2),
+  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
+  aggregateValues: Score.aggregate.averageWithVariance(["value"])
+});
+var deltaScore = Score.of({
+  id: "delta",
+  name: "Delta",
+  displayStrategy: "number",
+  formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
+  formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
+  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
+});
+var binaryScore = Score.of({
+  id: "binary",
+  name: "Result",
+  displayStrategy: "passFail",
+  formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
+  formatAggregate: (data) => {
+    const base = data.passed ? "All: PASSED" : "Some: FAILED";
+    if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
+      return `${base} (${data.passedCount}/${data.totalCount})`;
+    }
+    return base;
+  },
+  aggregateValues: Score.aggregate.all
+});
+// src/evals/tag-set.ts
+var TagSet = class {
+  constructor() {
+  }
+  static define(tags) {
+    const out = {};
+    for (const tag of tags) {
+      out[tag] = tag;
+    }
+    return out;
+  }
+};
+// src/evals/test-case.ts
+function resolve(value) {
+  return typeof value === "function" ? value() : value;
+}
+var TestCase = class _TestCase {
+  constructor(config) {
+    this._config = config;
+  }
+  static describe(config) {
+    const name = validateTestCaseName(config.name, "TestCase.describe");
+    const displayName = normalizeOptionalDisplayName(config.displayName);
+    return new _TestCase({
+      name,
+      displayName,
+      tags: config.tags,
+      inputSchema: config.inputSchema,
+      input: config.input,
+      outputSchema: config.outputSchema,
+      output: config.output
+    });
+  }
+  getName() {
+    return this._config.name;
+  }
+  getDisplayName() {
+    return this._config.displayName;
+  }
+  getDisplayLabel() {
+    return this._config.displayName ?? this._config.name;
+  }
+  getTags() {
+    return this._config.tags;
+  }
+  getInputSchema() {
+    return this._config.inputSchema;
+  }
+  getInput() {
+    return resolve(this._config.input);
+  }
+  getOutputSchema() {
+    return this._config.outputSchema;
+  }
+  getOutput() {
+    if (this._config.output === void 0) {
+      return void 0;
+    }
+    return resolve(this._config.output);
+  }
+};
+function getTestCaseDisplayLabel(testCase) {
+  if (typeof testCase.getDisplayLabel === "function") {
+    return testCase.getDisplayLabel();
+  }
+  return typeof testCase.getName === "function" ? testCase.getName() : "";
+}
+function getTestCaseTagList(testCase) {
+  return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
+}
+async function loadRunSnapshotsFromArtifacts(config) {
+  const baseDir = path.resolve(config.artifactDirectory);
+  let entries;
+  try {
+    entries = await promises.readdir(baseDir);
+  } catch {
+    return [];
+  }
+  const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
+  const snapshots = [];
+  for (const fileName of jsonlFiles) {
+    const filePath = path.join(baseDir, fileName);
+    try {
+      const snapshot = await parseArtifactToSnapshot(filePath, config);
+      if (snapshot) {
+        snapshots.push(snapshot);
+      }
+    } catch {
+    }
+  }
+  return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
+}
+async function parseArtifactToSnapshot(filePath, _config) {
+  const content = await promises.readFile(filePath, "utf8");
+  const lines = content.split("\n").filter((line) => line.trim().length > 0);
+  if (lines.length === 0) {
+    return null;
+  }
+  let runQueued = null;
+  let runCompleted = null;
+  let runFailed = null;
+  let runStarted = null;
+  for (const line of lines) {
+    try {
+      const event = JSON.parse(line);
+      const type = event.type;
+      if (type === "RunQueued") {
+        runQueued = {
+          runId: event.runId,
+          datasetId: event.datasetId,
+          datasetName: event.datasetName,
+          evaluatorIds: event.evaluatorIds,
+          totalTestCases: event.totalTestCases ?? 0,
+          artifactPath: event.artifactPath ?? filePath,
+          ts: event.ts
+        };
+      }
+      if (type === "RunStarted") {
+        runStarted = { startedAt: event.startedAt };
+      }
+      if (type === "RunCompleted") {
+        runCompleted = {
+          passedTestCases: event.passedTestCases,
+          failedTestCases: event.failedTestCases,
+          totalTestCases: event.totalTestCases,
+          finishedAt: event.finishedAt
+        };
+      }
+      if (type === "RunFailed") {
+        runFailed = {
+          finishedAt: event.finishedAt,
+          errorMessage: event.errorMessage
+        };
+      }
+    } catch {
+    }
+  }
+  if (!runQueued) {
+    return null;
+  }
+  const artifactPath = filePath;
+  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
+  const progress = aggregateTestCaseProgress(lines);
+  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
+  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
+  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
+  return {
+    runId: runQueued.runId,
+    datasetId: runQueued.datasetId,
+    datasetName: runQueued.datasetName,
+    evaluatorIds: runQueued.evaluatorIds,
+    queuedAt: runQueued.ts ?? 0,
+    startedAt: runStarted?.startedAt,
+    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
+    totalTestCases: runQueued.totalTestCases,
+    completedTestCases,
+    passedTestCases,
+    failedTestCases,
+    status,
+    artifactPath,
+    errorMessage: runFailed?.errorMessage
+  };
+}
+function aggregateTestCaseProgress(lines) {
+  let completedTestCases = 0;
+  const testCasePassedBy = /* @__PURE__ */ new Map();
+  for (const line of lines) {
+    try {
+      const event = JSON.parse(line);
+      if (event.type === "TestCaseProgress") {
+        const ev = event;
+        completedTestCases = ev.completedTestCases ?? completedTestCases;
+        const id = ev.testCaseId;
+        const current = testCasePassedBy.get(id);
+        testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
+      }
+    } catch {
+    }
   }
-  console.log(diff || "(no differences)");
-  return diff;
+  let passedTestCases = 0;
+  let failedTestCases = 0;
+  for (const passed of testCasePassedBy.values()) {
+    if (passed) {
+      passedTestCases += 1;
+    } else {
+      failedTestCases += 1;
+    }
+  }
+  return { completedTestCases, passedTestCases, failedTestCases };
 }
 // src/runner/config.ts
@@ -896,18 +1169,9 @@ var defaultRunnerConfig = {
   discovery: {
     rootDir: process.cwd(),
     datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
-    evaluatorSuffixes: [
-      ".evaluator.ts",
-      ".evaluator.tsx",
-      ".evaluator.js",
-      ".evaluator.mjs"
-    ],
-    testCaseSuffixes: [
-      ".test-case.ts",
-      ".test-case.tsx",
-      ".test-case.js",
-      ".test-case.mjs"
-    ],
+    evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
+    runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
+    testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
     excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
   },
   artifactDirectory: ".eval-results",
@@ -932,6 +1196,11 @@ function toRunnerConfigOverrides(config) {
   } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
     discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
   }
+  if (rawDiscovery?.runConfigFilePatterns !== void 0) {
+    discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
+  } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
+    discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
+  }
   if (rawDiscovery?.testCaseFilePatterns !== void 0) {
     discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
   } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -974,14 +1243,15 @@ function getJitiLoader() {
   }
   const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
   if (typeof createJiti2 !== "function") {
-    throw new Error(
-      "Failed to initialize jiti for m4trix eval config loading."
-    );
+    throw new Error("Failed to initialize jiti for m4trix eval config loading.");
   }
-  cachedLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
-    interopDefault: true,
-    moduleCache: true
-  });
+  cachedLoader = createJiti2(
+    (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
+    {
+      interopDefault: true,
+      moduleCache: true
+    }
+  );
   return cachedLoader;
 }
 function resolveConfigModuleExport(loadedModule) {
@@ -1029,6 +1299,9 @@ function isDatasetLike(value) {
 function isEvaluatorLike(value) {
   return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
 }
+function isRunConfigLike(value) {
+  return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
+}
 function isTestCaseLike(value) {
   return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
 }
@@ -1085,9 +1358,7 @@ async function loadModuleExports(filePath) {
 }
 async function collectDatasetsFromFiles(config) {
   const files = await walkDirectory(config.rootDir, config.excludeDirectories);
-  const matched = files.filter(
-    (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
-  );
+  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
   const found = await Promise.all(
     matched.map(async (absolutePath) => {
       const exports = await loadModuleExports(absolutePath);
@@ -1104,9 +1375,7 @@ async function collectDatasetsFromFiles(config) {
 }
 async function collectEvaluatorsFromFiles(config) {
   const files = await walkDirectory(config.rootDir, config.excludeDirectories);
-  const matched = files.filter(
-    (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
-  );
+  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
   const found = await Promise.all(
     matched.map(async (absolutePath) => {
       const exports = await loadModuleExports(absolutePath);
@@ -1121,11 +1390,26 @@ async function collectEvaluatorsFromFiles(config) {
   );
   return found.flat();
 }
-async function collectTestCasesFromFiles(config) {
+async function collectRunConfigsFromFiles(config) {
   const files = await walkDirectory(config.rootDir, config.excludeDirectories);
-  const matched = files.filter(
-    (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
+  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
+  const found = await Promise.all(
+    matched.map(async (absolutePath) => {
+      const exports = await loadModuleExports(absolutePath);
+      const runConfigs = exports.filter(isRunConfigLike);
+      const relPath = path.relative(config.rootDir, absolutePath);
+      return runConfigs.map((runConfig) => ({
+        id: runConfig.getName(),
+        filePath: relPath,
+        runConfig
+      }));
+    })
   );
+  return found.flat();
+}
+async function collectTestCasesFromFiles(config) {
+  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
+  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
   const found = await Promise.all(
     matched.map(async (absolutePath) => {
       const exports = await loadModuleExports(absolutePath);
@@ -1215,15 +1499,17 @@ function readOutput(testCase) {
   }
   return candidate.getOutput();
 }
-function buildEvaluationUnits(testCases) {
+function buildEvaluationUnits(testCases, repetitionCount) {
+  const count = Math.max(1, repetitionCount);
   const units = [];
   for (const testCaseItem of testCases) {
-    const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
-    for (let r = 0; r < rerunTotal; r++) {
+    const repetitionId = `rep-${crypto.randomUUID()}`;
+    for (let r = 0; r < count; r++) {
       units.push({
         testCaseItem,
-        rerunIndex: r + 1,
-        rerunTotal
+        repetitionId,
+        repetitionIndex: r + 1,
+        repetitionCount: count
       });
     }
   }
@@ -1233,29 +1519,24 @@ function nowIsoForFile() {
   return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
 }
 function createArtifactPath(artifactDirectory, datasetId, runId) {
-  return path.join(
-    artifactDirectory,
-    `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
-  );
+  return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
 }
 function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
-  const { testCaseItem, rerunIndex, rerunTotal } = unit;
+  const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
   return effect.Effect.gen(function* () {
     const evaluatorRunId = `run-${crypto.randomUUID()}`;
     const started = Date.now();
-    const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
-      n + 1,
-      n + 1
-    ]);
+    const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
     yield* publishEvent({
       type: "TestCaseStarted",
       runId: task.runId,
       testCaseId: testCaseItem.id,
-      testCaseName: testCaseItem.testCase.getName(),
+      testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
       startedTestCases: startedEvaluations,
       totalTestCases: totalEvaluations,
-      rerunIndex,
-      rerunTotal
+      repetitionId,
+      repetitionIndex,
+      repetitionCount
     });
     const evaluatorScores = [];
     let testCaseError;
@@ -1279,9 +1560,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
         return error;
       };
       try {
-        const ctx = yield* effect.Effect.promise(
-          () => Promise.resolve(evaluator.resolveContext())
-        );
+        const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
         const result = yield* effect.Effect.promise(
           () => Promise.resolve().then(
             () => evaluateFn({
@@ -1291,8 +1570,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
               meta: {
                 triggerId: task.triggerId,
                 runId: evaluatorRunId,
-                datasetId: task.datasetId
+                datasetId: task.datasetId,
+                repetitionId,
+                repetitionIndex,
+                repetitionCount,
+                runConfigName: task.runConfigName
               },
+              testCaseTags: getTestCaseTagList(testCaseItem.testCase),
+              runConfigTags: task.runConfigTags,
+              evaluatorTags: getEvaluatorTagList(evaluator),
               logDiff,
               log,
               createError
@@ -1335,21 +1621,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
         });
       }
     }
-    const rerunPassedThis = evaluatorScores.every((s) => s.passed);
-    const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
-      n + 1,
-      n + 1
-    ]);
+    const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
+    const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
     const progressEvent = {
       type: "TestCaseProgress",
       runId: task.runId,
       testCaseId: testCaseItem.id,
-      testCaseName: testCaseItem.testCase.getName(),
+      testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
       completedTestCases: completedEvaluations,
       totalTestCases: totalEvaluations,
-      rerunIndex,
-      rerunTotal,
-      passed: rerunPassedThis,
+      repetitionId,
+      repetitionIndex,
+      repetitionCount,
+      passed: repetitionPassedThis,
       durationMs: Date.now() - started,
       evaluatorScores,
       output,
@@ -1370,9 +1654,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
       (map) => {
         const key = testCaseItem.id;
         const existing = map.get(key) ?? { completedCount: 0, results: [] };
-        const newResults = [...existing.results, rerunPassedThis];
+        const newResults = [...existing.results, repetitionPassedThis];
         const newCompletedCount = existing.completedCount + 1;
-        const isLast = newCompletedCount === rerunTotal;
+        const isLast = newCompletedCount === repetitionCount;
         const newMap = new Map(map);
         newMap.set(key, {
           completedCount: newCompletedCount,
@@ -1388,10 +1672,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
       } else {
         yield* effect.Ref.update(failedRef, (n) => n + 1);
       }
-      const [passed, failed] = yield* effect.Effect.all([
-        effect.Ref.get(passedRef),
-        effect.Ref.get(failedRef)
-      ]);
+      const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
       yield* updateSnapshot(task.runId, (snapshot) => ({
         ...snapshot,
         passedTestCases: passed,
@@ -1412,10 +1693,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
     runId: task.runId,
     startedAt
   });
-  const totalEvaluations = task.testCases.reduce(
-    (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
-    0
-  );
+  const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
   const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
   const completedRef = yield* effect.Ref.make(0);
   const startedRef = yield* effect.Ref.make(0);
@@ -1424,7 +1702,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
   const testCaseResultsRef = yield* effect.Ref.make(
     /* @__PURE__ */ new Map()
   );
-  const evaluationUnits = buildEvaluationUnits(task.testCases);
+  const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
   const processEvaluation = (unit) => processOneEvaluation(
     task,
     unit,
@@ -1438,11 +1716,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
     failedRef,
     testCaseResultsRef
   );
-  yield* effect.Effect.forEach(
-    evaluationUnits,
-    processEvaluation,
-    maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
-  );
+  const globalSem = task.globalEvaluationSemaphore;
+  if (globalSem !== void 0) {
+    yield* effect.Effect.forEach(
+      evaluationUnits,
+      (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
+      { concurrency: "unbounded", discard: true }
+    );
+  } else {
+    yield* effect.Effect.forEach(
+      evaluationUnits,
+      processEvaluation,
+      maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
+    );
+  }
   const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
     effect.Ref.get(completedRef),
     effect.Ref.get(passedRef),
@@ -1478,125 +1765,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
     artifactPath: task.snapshot.artifactPath
   });
 });
-async function loadRunSnapshotsFromArtifacts(config) {
-  const baseDir = path.resolve(config.artifactDirectory);
-  let entries;
-  try {
-    entries = await promises.readdir(baseDir);
-  } catch {
-    return [];
-  }
-  const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
-  const snapshots = [];
-  for (const fileName of jsonlFiles) {
-    const filePath = path.join(baseDir, fileName);
-    try {
-      const snapshot = await parseArtifactToSnapshot(filePath, config);
-      if (snapshot) {
-        snapshots.push(snapshot);
-      }
-    } catch {
-    }
-  }
-  return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
-}
-async function parseArtifactToSnapshot(filePath, _config) {
-  const content = await promises.readFile(filePath, "utf8");
-  const lines = content.split("\n").filter((line) => line.trim().length > 0);
-  if (lines.length === 0) {
-    return null;
-  }
-  let runQueued = null;
-  let runCompleted = null;
-  let runFailed = null;
-  let runStarted = null;
-  for (const line of lines) {
-    try {
-      const event = JSON.parse(line);
-      const type = event.type;
-      if (type === "RunQueued") {
-        runQueued = {
-          runId: event.runId,
-          datasetId: event.datasetId,
-          datasetName: event.datasetName,
-          evaluatorIds: event.evaluatorIds,
-          totalTestCases: event.totalTestCases ?? 0,
-          artifactPath: event.artifactPath ?? filePath,
-          ts: event.ts
-        };
-      }
-      if (type === "RunStarted") {
-        runStarted = { startedAt: event.startedAt };
-      }
-      if (type === "RunCompleted") {
-        runCompleted = {
-          passedTestCases: event.passedTestCases,
-          failedTestCases: event.failedTestCases,
-          totalTestCases: event.totalTestCases,
-          finishedAt: event.finishedAt
-        };
-      }
-      if (type === "RunFailed") {
-        runFailed = {
-          finishedAt: event.finishedAt,
-          errorMessage: event.errorMessage
-        };
-      }
-    } catch {
-    }
+// src/runner/name-pattern.ts
+function parseRegexLiteral(pattern) {
+  if (!pattern.startsWith("/")) {
+    return void 0;
   }
-  if (!runQueued) {
-    return null;
+  const lastSlash = pattern.lastIndexOf("/");
+  if (lastSlash <= 0) {
+    return void 0;
   }
-  const artifactPath = filePath;
-  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
-  const progress = aggregateTestCaseProgress(lines);
-  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
-  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
-  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
   return {
-    runId: runQueued.runId,
-    datasetId: runQueued.datasetId,
-    datasetName: runQueued.datasetName,
-    evaluatorIds: runQueued.evaluatorIds,
-    queuedAt: runQueued.ts ?? 0,
-    startedAt: runStarted?.startedAt,
-    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
-    totalTestCases: runQueued.totalTestCases,
-    completedTestCases,
-    passedTestCases,
-    failedTestCases,
-    status,
-    artifactPath,
-    errorMessage: runFailed?.errorMessage
+    source: pattern.slice(1, lastSlash),
+    flags: pattern.slice(lastSlash + 1)
   };
 }
-function aggregateTestCaseProgress(lines) {
-  let completedTestCases = 0;
-  const testCasePassedBy = /* @__PURE__ */ new Map();
-  for (const line of lines) {
-    try {
-      const event = JSON.parse(line);
-      if (event.type === "TestCaseProgress") {
-        const ev = event;
-        completedTestCases = ev.completedTestCases ?? completedTestCases;
-        const id = ev.testCaseId;
-        const current = testCasePassedBy.get(id);
-        testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
-      }
-    } catch {
-    }
+function createNameMatcher(pattern) {
+  const normalizedPattern = pattern.trim();
+  const regexLiteral = parseRegexLiteral(normalizedPattern);
+  if (regexLiteral) {
+    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
+    return (value) => regex.test(value);
   }
-  let passedTestCases = 0;
-  let failedTestCases = 0;
-  for (const passed of testCasePassedBy.values()) {
-    if (passed) {
-      passedTestCases += 1;
-    } else {
-      failedTestCases += 1;
-    }
+  if (normalizedPattern.includes("*")) {
+    const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
+    const regex = new RegExp(`^${escaped}$`, "i");
+    return (value) => regex.test(value);
   }
-  return { completedTestCases, passedTestCases, failedTestCases };
+  return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
 }
 async function appendJsonLine(artifactPath, payload) {
   await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1655,32 +1851,12 @@ function searchCollectedTestCases(all, query) {
 }
 // src/runner/api.ts
-function parseRegexLiteral(pattern) {
-  if (!pattern.startsWith("/")) {
-    return void 0;
-  }
-  const lastSlash = pattern.lastIndexOf("/");
-  if (lastSlash <= 0) {
-    return void 0;
-  }
-  return {
-    source: pattern.slice(1, lastSlash),
-    flags: pattern.slice(lastSlash + 1)
-  };
-}
-function createNameMatcher(pattern) {
-  const normalizedPattern = pattern.trim();
-  const regexLiteral = parseRegexLiteral(normalizedPattern);
-  if (regexLiteral) {
-    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
-    return (value) => regex.test(value);
-  }
-  if (normalizedPattern.includes("*")) {
-    const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
-    const regex = new RegExp(`^${escaped}$`, "i");
-    return (value) => regex.test(value);
+function normalizeRunRepetitions(value) {
+  const n = value ?? 1;
+  if (!Number.isInteger(n) || n < 1) {
+    throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
   }
-  return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
+  return n;
 }
 function mergeRunnerOverrides(base, next) {
   if (!base) {
@@ -1711,15 +1887,12 @@ var EffectRunner = class {
     this.persistenceQueue = effect.Effect.runSync(
       effect.Queue.unbounded()
     );
-    this.snapshotsRef = effect.Effect.runSync(
-      effect.Ref.make(/* @__PURE__ */ new Map())
-    );
+    this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
     this.listeners = /* @__PURE__ */ new Set();
     this.datasetsById = /* @__PURE__ */ new Map();
     this.evaluatorsById = /* @__PURE__ */ new Map();
-    this.schedulerFiber = effect.Effect.runFork(
-      this.createSchedulerEffect()
-    );
+    this.runConfigsById = /* @__PURE__ */ new Map();
+    this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
     this.persistenceFiber = effect.Effect.runFork(
       createPersistenceWorker(this.persistenceQueue)
     );
@@ -1759,6 +1932,137 @@ var EffectRunner = class {
       (item) => matcher(item.evaluator.getName() ?? "")
     );
   }
+  async collectRunConfigs() {
+    const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
+    this.runConfigsById.clear();
+    const byNameLower = /* @__PURE__ */ new Map();
+    for (const item of runConfigs) {
+      const id = item.runConfig.getName();
+      const lower = id.toLowerCase();
+      const prev = byNameLower.get(lower);
+      if (prev !== void 0 && prev.filePath !== item.filePath) {
+        throw new Error(
+          `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
+        );
+      }
+      byNameLower.set(lower, item);
+      this.runConfigsById.set(id, item);
+    }
+    return runConfigs;
+  }
+  async resolveRunConfigByName(name) {
+    if (this.runConfigsById.size === 0) {
+      await this.collectRunConfigs();
+    }
+    const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
+    const keyLower = key.toLowerCase();
+    const matches = Array.from(this.runConfigsById.values()).filter(
+      (item) => item.runConfig.getName().toLowerCase() === keyLower
+    );
+    if (matches.length === 0) {
+      return void 0;
+    }
+    if (matches.length > 1) {
+      throw new Error(
+        `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
+      );
+    }
+    return matches[0];
+  }
+  async expandRunConfigToJobs(collected) {
+    if (this.datasetsById.size === 0) {
+      await this.collectDatasets();
+    }
+    if (this.evaluatorsById.size === 0) {
+      await this.collectEvaluators();
+    }
+    const rcName = collected.runConfig.getName();
+    const jobs = [];
+    const runs = collected.runConfig.getRuns();
+    for (const [i, row] of runs.entries()) {
+      const dsCollected = Array.from(this.datasetsById.values()).find(
+        (d) => d.dataset === row.dataset
+      );
+      if (!dsCollected) {
+        throw new Error(
+          `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
+        );
+      }
+      let evaluatorIds;
+      if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
+        const matcher = createNameMatcher(row.evaluatorPattern);
+        const matched = Array.from(this.evaluatorsById.values()).filter(
+          (item) => matcher(item.evaluator.getName() ?? "")
+        );
+        if (matched.length === 0) {
+          throw new Error(
+            `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
+          );
+        }
+        evaluatorIds = matched.map((item) => item.id);
+      } else {
+        const evaluators = row.evaluators;
+        evaluatorIds = [];
+        for (const ev of evaluators) {
+          const found = Array.from(this.evaluatorsById.values()).find(
+            (item) => item.evaluator === ev
+          );
+          if (!found) {
+            throw new Error(
+              `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
+            );
+          }
+          evaluatorIds.push(found.id);
+        }
+      }
+      const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
+      jobs.push({
+        datasetId: dsCollected.id,
+        evaluatorIds,
+        runConfigName: rcName,
+        runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
+        runConfigTags: collected.runConfig.getTags(),
+        repetitions
+      });
+    }
+    return jobs;
+  }
+  async expandRunConfigNamesToJobs(names) {
+    const jobs = [];
+    for (const name of names) {
+      const collected = await this.resolveRunConfigByName(name);
+      if (!collected) {
+        const known = await this.collectRunConfigs();
+        const available = known.map((r) => r.runConfig.getName()).sort();
+        throw new Error(
+          available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
+        );
+      }
+      jobs.push(...await this.expandRunConfigToJobs(collected));
+    }
+    return jobs;
+  }
+  async runDatasetJobsWithSharedConcurrency(request) {
+    const globalConcurrency = Math.max(1, request.globalConcurrency);
+    const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
+    const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
+    const snapshots = [];
+    for (const job of request.jobs) {
+      snapshots.push(
+        await this.startDatasetRun({
+          datasetId: job.datasetId,
+          evaluatorIds: job.evaluatorIds,
+          triggerId,
+          maxConcurrency: this.config.maxConcurrency ?? 1,
+          globalEvaluationSemaphore: sem,
+          runConfigName: job.runConfigName,
+          runConfigTags: job.runConfigTags,
+          repetitions: job.repetitions
+        })
+      );
+    }
+    return snapshots;
+  }
   async searchTestCases(query) {
     const testCases = await collectTestCasesFromFiles(this.config.discovery);
     return searchCollectedTestCases(testCases, query);
@@ -1777,35 +2081,45 @@ var EffectRunner = class {
     );
   }
   async runDatasetWith(request) {
+    const runConfigName = validateRunConfigName(
+      request.runConfigName,
+      "runDatasetWith.runConfigName"
+    );
+    return this.startDatasetRun({
+      datasetId: request.datasetId,
+      evaluatorIds: request.evaluatorIds,
+      triggerId: request.triggerId,
+      maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
+      repetitions: request.repetitions,
+      runConfigName,
+      runConfigTags: request.runConfigTags
+    });
+  }
+  async startDatasetRun(params) {
     if (this.datasetsById.size === 0) {
       await this.collectDatasets();
     }
     if (this.evaluatorsById.size === 0) {
       await this.collectEvaluators();
     }
-    const dataset = this.datasetsById.get(request.datasetId);
+    const dataset = this.datasetsById.get(params.datasetId);
     if (!dataset) {
-      throw new Error(`Unknown dataset: ${request.datasetId}`);
+      throw new Error(`Unknown dataset: ${params.datasetId}`);
     }
-    const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
+    const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
     if (selectedEvaluators.length === 0) {
       throw new Error("No evaluators selected for run");
     }
-    const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
-    const totalEvaluations = selectedTestCases.reduce(
-      (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
-      0
-    );
-    const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
+    const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
+    const repetitions = normalizeRunRepetitions(params.repetitions);
+    const totalEvaluations = selectedTestCases.length * repetitions;
+    const runConfigTags = [...params.runConfigTags ?? []];
+    const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
     const runId = `run-${crypto.randomUUID()}`;
-    const artifactPath = createArtifactPath(
-      this.config.artifactDirectory,
-      request.datasetId,
-      runId
-    );
+    const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
     const snapshot = {
       runId,
-      datasetId: request.datasetId,
+      datasetId: params.datasetId,
       datasetName: dataset.dataset.getName(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       queuedAt: Date.now(),
@@ -1826,7 +2140,7 @@ var EffectRunner = class {
     const queuedEvent = {
       type: "RunQueued",
       runId,
-      datasetId: request.datasetId,
+      datasetId: params.datasetId,
       datasetName: dataset.dataset.getName(),
       evaluatorIds: selectedEvaluators.map((item) => item.id),
       totalTestCases: totalEvaluations,
@@ -1840,17 +2154,20 @@ var EffectRunner = class {
         payload: queuedEvent
       })
     );
-    const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
     await effect.Effect.runPromise(
       effect.Queue.offer(this.runQueue, {
         runId,
         triggerId,
-        datasetId: request.datasetId,
+        datasetId: params.datasetId,
         dataset: dataset.dataset,
         evaluators: selectedEvaluators,
         testCases: selectedTestCases,
         snapshot,
-        maxConcurrency
+        maxConcurrency: params.maxConcurrency,
+        globalEvaluationSemaphore: params.globalEvaluationSemaphore,
+        runConfigName: params.runConfigName,
+        runConfigTags,
+        repetitions
       })
     );
     return snapshot;
@@ -1866,9 +2183,9 @@ var EffectRunner = class {
     return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
   }
   getAllRunSnapshots() {
-    return Array.from(
-      effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
-    ).sort((a, b) => b.queuedAt - a.queuedAt);
+    return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
+      (a, b) => b.queuedAt - a.queuedAt
+    );
   }
   async loadRunSnapshotsFromArtifacts() {
     return loadRunSnapshotsFromArtifacts(this.config);
@@ -1922,15 +2239,26 @@ var EffectRunner = class {
   }
 };
+// src/runner/events.ts
+var PROGRAMMATIC_RUN_CONFIG = {
+  runConfigName: "programmatic"
+};
 Object.defineProperty(exports, 'S', {
   enumerable: true,
   get: function () { return effect.Schema; }
 });
 exports.Dataset = Dataset;
 exports.Evaluator = Evaluator;
+exports.EvaluatorNameSchema = EvaluatorNameSchema;
 exports.Metric = Metric;
+exports.PROGRAMMATIC_RUN_CONFIG = PROGRAMMATIC_RUN_CONFIG;
+exports.RunConfig = RunConfig;
+exports.RunConfigNameSchema = RunConfigNameSchema;
 exports.Score = Score;
+exports.TagSet = TagSet;
 exports.TestCase = TestCase;
+exports.TestCaseNameSchema = TestCaseNameSchema;
 exports.binaryScore = binaryScore;
 exports.createLogEntry = createLogEntry;
 exports.createRunner = createRunner;
@@ -1938,16 +2266,24 @@ exports.defaultRunnerConfig = defaultRunnerConfig;
 exports.defineConfig = defineConfig;
 exports.deltaScore = deltaScore;
 exports.formatScoreData = formatScoreData;
+exports.getEvaluatorDisplayLabel = getEvaluatorDisplayLabel;
+exports.getEvaluatorTagList = getEvaluatorTagList;
 exports.getLogLines = getLogLines;
 exports.getMetricById = getMetricById;
 exports.getScoreById = getScoreById;
+exports.getTestCaseDisplayLabel = getTestCaseDisplayLabel;
+exports.getTestCaseTagList = getTestCaseTagList;
 exports.latencyMetric = latencyMetric;
 exports.loadMockData = loadMockData;
 exports.loadRunnerData = loadRunnerData;
+exports.normalizeOptionalDisplayName = normalizeOptionalDisplayName;
 exports.parseStartupArgs = parseStartupArgs;
 exports.percentScore = percentScore;
 exports.printJsonDiff = printJsonDiff;
 exports.tokenCountMetric = tokenCountMetric;
+exports.validateEvaluatorName = validateEvaluatorName;
+exports.validateRunConfigName = validateRunConfigName;
+exports.validateTestCaseName = validateTestCaseName;
 exports.withRunnerConfig = withRunnerConfig;
 //# sourceMappingURL=out.js.map
 //# sourceMappingURL=index.cjs.map