@m4trix/evals 0.28.0 → 0.30.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -141,7 +141,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
141
141
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
142
142
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
143
143
 
144
- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
144
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`triggerId`**, **`triggerTimestamp`** (ms since epoch when the run was triggered; the simple CLI sets this once at process start), **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
145
145
 
146
146
  ```ts
147
147
  import { RunConfig } from '@m4trix/evals';
@@ -1012,8 +1012,11 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1012
1012
  output,
1013
1013
  meta: {
1014
1014
  triggerId: task.triggerId,
1015
+ triggerTimestamp: task.triggerTimestamp,
1015
1016
  runId: evaluatorRunId,
1016
1017
  datasetName: task.dataset.getDisplayLabel(),
1018
+ testCaseId: testCaseItem.id,
1019
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1017
1020
  repetitionId,
1018
1021
  repetitionIndex,
1019
1022
  repetitionCount,
@@ -1490,6 +1493,7 @@ var EffectRunner = class {
1490
1493
  const globalConcurrency = Math.max(1, request.globalConcurrency);
1491
1494
  const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
1492
1495
  const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1496
+ const triggerTimestamp = request.triggerTimestamp ?? Date.now();
1493
1497
  const snapshots = [];
1494
1498
  for (const job of request.jobs) {
1495
1499
  snapshots.push(
@@ -1497,6 +1501,7 @@ var EffectRunner = class {
1497
1501
  datasetId: job.datasetId,
1498
1502
  evaluatorIds: job.evaluatorIds,
1499
1503
  triggerId,
1504
+ triggerTimestamp,
1500
1505
  maxConcurrency: this.config.maxConcurrency ?? 1,
1501
1506
  globalEvaluationSemaphore: sem,
1502
1507
  runConfigName: job.runConfigName,
@@ -1534,6 +1539,7 @@ var EffectRunner = class {
1534
1539
  datasetId: request.datasetId,
1535
1540
  evaluatorIds: request.evaluatorIds,
1536
1541
  triggerId: request.triggerId,
1542
+ triggerTimestamp: request.triggerTimestamp ?? Date.now(),
1537
1543
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1538
1544
  repetitions: request.repetitions,
1539
1545
  runConfigName,
@@ -1561,6 +1567,7 @@ var EffectRunner = class {
1561
1567
  const totalEvaluations = selectedTestCases.length * repetitions;
1562
1568
  const runConfigTags = [...params.runConfigTags ?? []];
1563
1569
  const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1570
+ const triggerTimestamp = params.triggerTimestamp ?? Date.now();
1564
1571
  const runId = `run-${crypto.randomUUID()}`;
1565
1572
  const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1566
1573
  const snapshot = {
@@ -1604,6 +1611,7 @@ var EffectRunner = class {
1604
1611
  effect.Queue.offer(this.runQueue, {
1605
1612
  runId,
1606
1613
  triggerId,
1614
+ triggerTimestamp,
1607
1615
  datasetId: params.datasetId,
1608
1616
  dataset: dataset.dataset,
1609
1617
  evaluators: selectedEvaluators,
@@ -2036,6 +2044,7 @@ function RunView({
2036
2044
  runConfigNames,
2037
2045
  concurrency,
2038
2046
  experimentName,
2047
+ triggerTimestamp,
2039
2048
  onComplete
2040
2049
  }) {
2041
2050
  const [phase, setPhase] = React.useState("loading");
@@ -2206,7 +2215,8 @@ function RunView({
2206
2215
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2207
2216
  jobs,
2208
2217
  globalConcurrency: concurrency,
2209
- experimentName
2218
+ experimentName,
2219
+ triggerTimestamp
2210
2220
  });
2211
2221
  for (let i = 0; i < snapshots.length; i += 1) {
2212
2222
  const snap = snapshots[i];
@@ -2263,7 +2273,7 @@ function RunView({
2263
2273
  setPhase("completed");
2264
2274
  const exitCode = failedTestCases > 0 ? 1 : 0;
2265
2275
  setTimeout(() => onComplete(void 0, exitCode), 200);
2266
- }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2276
+ }, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
2267
2277
  React.useEffect(() => {
2268
2278
  void runEval();
2269
2279
  }, [runEval]);
@@ -2751,7 +2761,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2751
2761
  }
2752
2762
  return lines;
2753
2763
  }
2754
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2764
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
2755
2765
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2756
2766
  if (jobs.length === 0) {
2757
2767
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2961,7 +2971,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
2961
2971
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2962
2972
  jobs,
2963
2973
  globalConcurrency: concurrency,
2964
- experimentName
2974
+ experimentName,
2975
+ triggerTimestamp
2965
2976
  });
2966
2977
  for (let i = 0; i < snapshots.length; i += 1) {
2967
2978
  const snap = snapshots[i];
@@ -3061,7 +3072,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
3061
3072
  }
3062
3073
  return failedTestCasesTotal > 0 ? 1 : 0;
3063
3074
  }
3064
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3075
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
3065
3076
  return new Promise((resolve5, reject) => {
3066
3077
  const app = ink.render(
3067
3078
  React__namespace.createElement(RunView, {
@@ -3069,6 +3080,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
3069
3080
  runConfigNames,
3070
3081
  concurrency,
3071
3082
  experimentName,
3083
+ triggerTimestamp,
3072
3084
  onComplete: (err, exitCode) => {
3073
3085
  app.unmount();
3074
3086
  if (err) {
@@ -3126,11 +3138,13 @@ async function main() {
3126
3138
  try {
3127
3139
  if (args.command === "run") {
3128
3140
  const concurrency = args.concurrency ?? getDefaultConcurrency();
3141
+ const triggerTimestamp = Date.now();
3129
3142
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3130
3143
  runner,
3131
3144
  args.runConfigNames,
3132
3145
  concurrency,
3133
- args.experimentName
3146
+ args.experimentName,
3147
+ triggerTimestamp
3134
3148
  );
3135
3149
  if (args.ci && exitCode !== 0) {
3136
3150
  process.exit(1);