@m4trix/evals 0.29.0 → 0.31.0

This diff shows the differences between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -141,7 +141,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
141
141
  `evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
142
142
  (wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
143
143
 
144
- Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
144
+ Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`triggerId`**, **`triggerTimestamp`** (milliseconds since the Unix epoch when the run was triggered; the simple CLI sets this once at process start), **`triggeredAt`** (the same instant as an ISO 8601 string), **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
145
145
 
146
146
  ```ts
147
147
  import { RunConfig } from '@m4trix/evals';
@@ -1012,6 +1012,8 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1012
1012
  output,
1013
1013
  meta: {
1014
1014
  triggerId: task.triggerId,
1015
+ triggerTimestamp: task.triggerTimestamp,
1016
+ triggeredAt: new Date(task.triggerTimestamp).toISOString(),
1015
1017
  runId: evaluatorRunId,
1016
1018
  datasetName: task.dataset.getDisplayLabel(),
1017
1019
  testCaseId: testCaseItem.id,
@@ -1492,6 +1494,7 @@ var EffectRunner = class {
1492
1494
  const globalConcurrency = Math.max(1, request.globalConcurrency);
1493
1495
  const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
1494
1496
  const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1497
+ const triggerTimestamp = request.triggerTimestamp ?? Date.now();
1495
1498
  const snapshots = [];
1496
1499
  for (const job of request.jobs) {
1497
1500
  snapshots.push(
@@ -1499,6 +1502,7 @@ var EffectRunner = class {
1499
1502
  datasetId: job.datasetId,
1500
1503
  evaluatorIds: job.evaluatorIds,
1501
1504
  triggerId,
1505
+ triggerTimestamp,
1502
1506
  maxConcurrency: this.config.maxConcurrency ?? 1,
1503
1507
  globalEvaluationSemaphore: sem,
1504
1508
  runConfigName: job.runConfigName,
@@ -1536,6 +1540,7 @@ var EffectRunner = class {
1536
1540
  datasetId: request.datasetId,
1537
1541
  evaluatorIds: request.evaluatorIds,
1538
1542
  triggerId: request.triggerId,
1543
+ triggerTimestamp: request.triggerTimestamp ?? Date.now(),
1539
1544
  maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1540
1545
  repetitions: request.repetitions,
1541
1546
  runConfigName,
@@ -1563,6 +1568,7 @@ var EffectRunner = class {
1563
1568
  const totalEvaluations = selectedTestCases.length * repetitions;
1564
1569
  const runConfigTags = [...params.runConfigTags ?? []];
1565
1570
  const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1571
+ const triggerTimestamp = params.triggerTimestamp ?? Date.now();
1566
1572
  const runId = `run-${crypto.randomUUID()}`;
1567
1573
  const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1568
1574
  const snapshot = {
@@ -1606,6 +1612,7 @@ var EffectRunner = class {
1606
1612
  effect.Queue.offer(this.runQueue, {
1607
1613
  runId,
1608
1614
  triggerId,
1615
+ triggerTimestamp,
1609
1616
  datasetId: params.datasetId,
1610
1617
  dataset: dataset.dataset,
1611
1618
  evaluators: selectedEvaluators,
@@ -2038,6 +2045,7 @@ function RunView({
2038
2045
  runConfigNames,
2039
2046
  concurrency,
2040
2047
  experimentName,
2048
+ triggerTimestamp,
2041
2049
  onComplete
2042
2050
  }) {
2043
2051
  const [phase, setPhase] = React.useState("loading");
@@ -2208,7 +2216,8 @@ function RunView({
2208
2216
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2209
2217
  jobs,
2210
2218
  globalConcurrency: concurrency,
2211
- experimentName
2219
+ experimentName,
2220
+ triggerTimestamp
2212
2221
  });
2213
2222
  for (let i = 0; i < snapshots.length; i += 1) {
2214
2223
  const snap = snapshots[i];
@@ -2265,7 +2274,7 @@ function RunView({
2265
2274
  setPhase("completed");
2266
2275
  const exitCode = failedTestCases > 0 ? 1 : 0;
2267
2276
  setTimeout(() => onComplete(void 0, exitCode), 200);
2268
- }, [runner, runConfigNames, concurrency, experimentName, onComplete]);
2277
+ }, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
2269
2278
  React.useEffect(() => {
2270
2279
  void runEval();
2271
2280
  }, [runEval]);
@@ -2753,7 +2762,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2753
2762
  }
2754
2763
  return lines;
2755
2764
  }
2756
- async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
2765
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
2757
2766
  const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2758
2767
  if (jobs.length === 0) {
2759
2768
  throw new Error("No jobs expanded from RunConfigs.");
@@ -2963,7 +2972,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
2963
2972
  const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2964
2973
  jobs,
2965
2974
  globalConcurrency: concurrency,
2966
- experimentName
2975
+ experimentName,
2976
+ triggerTimestamp
2967
2977
  });
2968
2978
  for (let i = 0; i < snapshots.length; i += 1) {
2969
2979
  const snap = snapshots[i];
@@ -3063,7 +3073,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
3063
3073
  }
3064
3074
  return failedTestCasesTotal > 0 ? 1 : 0;
3065
3075
  }
3066
- async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
3076
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
3067
3077
  return new Promise((resolve5, reject) => {
3068
3078
  const app = ink.render(
3069
3079
  React__namespace.createElement(RunView, {
@@ -3071,6 +3081,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
3071
3081
  runConfigNames,
3072
3082
  concurrency,
3073
3083
  experimentName,
3084
+ triggerTimestamp,
3074
3085
  onComplete: (err, exitCode) => {
3075
3086
  app.unmount();
3076
3087
  if (err) {
@@ -3128,11 +3139,13 @@ async function main() {
3128
3139
  try {
3129
3140
  if (args.command === "run") {
3130
3141
  const concurrency = args.concurrency ?? getDefaultConcurrency();
3142
+ const triggerTimestamp = Date.now();
3131
3143
  const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
3132
3144
  runner,
3133
3145
  args.runConfigNames,
3134
3146
  concurrency,
3135
- args.experimentName
3147
+ args.experimentName,
3148
+ triggerTimestamp
3136
3149
  );
3137
3150
  if (args.ci && exitCode !== 0) {
3138
3151
  process.exit(1);