@m4trix/evals 0.29.0 → 0.30.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -141,7 +141,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
141
141
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
142
142
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
143
143
 
144
- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
144
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`triggerId`**, **`triggerTimestamp`** (milliseconds since the Unix epoch when the run was triggered; the simple CLI captures this once per `run` invocation and passes the same value to every job, and the runner falls back to `Date.now()` when no timestamp is supplied), **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
145
145
 
146
146
  ```ts
147
147
  import { RunConfig } from '@m4trix/evals';
@@ -1012,6 +1012,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1012
1012
  output,
1013
1013
  meta: {
1014
1014
  triggerId: task.triggerId,
1015
+ triggerTimestamp: task.triggerTimestamp,
1015
1016
  runId: evaluatorRunId,
1016
1017
  datasetName: task.dataset.getDisplayLabel(),
1017
1018
  testCaseId: testCaseItem.id,
@@ -1492,6 +1493,7 @@ var EffectRunner = class {
1492
1493
  const globalConcurrency = Math.max(1, request.globalConcurrency);
1493
1494
  const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
1494
1495
  const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1496
+ const triggerTimestamp = request.triggerTimestamp ?? Date.now();
1495
1497
  const snapshots = [];
1496
1498
  for (const job of request.jobs) {
1497
1499
  snapshots.push(
@@ -1499,6 +1501,7 @@ var EffectRunner = class {
1499
1501
  datasetId: job.datasetId,
1500
1502
  evaluatorIds: job.evaluatorIds,
1501
1503
  triggerId,
1504
+ triggerTimestamp,
1502
1505
  maxConcurrency: this.config.maxConcurrency ?? 1,
1503
1506
  globalEvaluationSemaphore: sem,
1504
1507
  runConfigName: job.runConfigName,
@@ -1536,6 +1539,7 @@ var EffectRunner = class {
1536
1539
  datasetId: request.datasetId,
1537
1540
  evaluatorIds: request.evaluatorIds,
1538
1541
  triggerId: request.triggerId,
1542
+ triggerTimestamp: request.triggerTimestamp ?? Date.now(),
1539
1543
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1540
1544
  repetitions: request.repetitions,
1541
1545
  runConfigName,
@@ -1563,6 +1567,7 @@ var EffectRunner = class {
1563
1567
  const totalEvaluations = selectedTestCases.length * repetitions;
1564
1568
  const runConfigTags = [...params.runConfigTags ?? []];
1565
1569
  const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1570
+ const triggerTimestamp = params.triggerTimestamp ?? Date.now();
1566
1571
  const runId = `run-${crypto.randomUUID()}`;
1567
1572
  const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1568
1573
  const snapshot = {
@@ -1606,6 +1611,7 @@ var EffectRunner = class {
1606
1611
  effect.Queue.offer(this.runQueue, {
1607
1612
  runId,
1608
1613
  triggerId,
1614
+ triggerTimestamp,
1609
1615
  datasetId: params.datasetId,
1610
1616
  dataset: dataset.dataset,
1611
1617
  evaluators: selectedEvaluators,
@@ -2038,6 +2044,7 @@ function RunView({
2038
2044
  runConfigNames,
2039
2045
  concurrency,
2040
2046
  experimentName,
2047
+ triggerTimestamp,
2041
2048
  onComplete
2042
2049
  }) {
2043
2050
  const [phase, setPhase] = React.useState("loading");
@@ -2208,7 +2215,8 @@ function RunView({
2208
2215
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2209
2216
  jobs,
2210
2217
  globalConcurrency: concurrency,
2211
- experimentName
2218
+ experimentName,
2219
+ triggerTimestamp
2212
2220
  });
2213
2221
  for (let i = 0; i < snapshots.length; i += 1) {
2214
2222
  const snap = snapshots[i];
@@ -2265,7 +2273,7 @@ function RunView({
2265
2273
  setPhase("completed");
2266
2274
  const exitCode = failedTestCases > 0 ? 1 : 0;
2267
2275
  setTimeout(() => onComplete(void 0, exitCode), 200);
2268
- }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2276
+ }, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
2269
2277
  React.useEffect(() => {
2270
2278
  void runEval();
2271
2279
  }, [runEval]);
@@ -2753,7 +2761,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2753
2761
  }
2754
2762
  return lines;
2755
2763
  }
2756
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2764
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
2757
2765
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2758
2766
  if (jobs.length === 0) {
2759
2767
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2963,7 +2971,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
2963
2971
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2964
2972
  jobs,
2965
2973
  globalConcurrency: concurrency,
2966
- experimentName
2974
+ experimentName,
2975
+ triggerTimestamp
2967
2976
  });
2968
2977
  for (let i = 0; i < snapshots.length; i += 1) {
2969
2978
  const snap = snapshots[i];
@@ -3063,7 +3072,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
3063
3072
  }
3064
3073
  return failedTestCasesTotal > 0 ? 1 : 0;
3065
3074
  }
3066
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3075
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
3067
3076
  return new Promise((resolve5, reject) => {
3068
3077
  const app = ink.render(
3069
3078
  React__namespace.createElement(RunView, {
@@ -3071,6 +3080,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
3071
3080
  runConfigNames,
3072
3081
  concurrency,
3073
3082
  experimentName,
3083
+ triggerTimestamp,
3074
3084
  onComplete: (err, exitCode) => {
3075
3085
  app.unmount();
3076
3086
  if (err) {
@@ -3128,11 +3138,13 @@ async function main() {
3128
3138
  try {
3129
3139
  if (args.command === "run") {
3130
3140
  const concurrency = args.concurrency ?? getDefaultConcurrency();
3141
+ const triggerTimestamp = Date.now();
3131
3142
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3132
3143
  runner,
3133
3144
  args.runConfigNames,
3134
3145
  concurrency,
3135
- args.experimentName
3146
+ args.experimentName,
3147
+ triggerTimestamp
3136
3148
  );
3137
3149
  if (args.ci && exitCode !== 0) {
3138
3150
  process.exit(1);