@m4trix/evals 0.29.0 → 0.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli-simple.cjs +19 -6
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +19 -6
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +9 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +9 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +7 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +15 -0
- package/dist/index.js +7 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -141,7 +141,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
|
|
|
141
141
|
`evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
|
|
142
142
|
(wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
143
143
|
|
|
144
|
-
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
144
|
+
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`triggerId`**, **`triggerTimestamp`** (ms since epoch when the run was triggered; the simple CLI sets this once at process start), **`triggeredAt`** (same instant as ISO 8601), **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
145
145
|
|
|
146
146
|
```ts
|
|
147
147
|
import { RunConfig } from '@m4trix/evals';
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -1012,6 +1012,8 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1012
1012
|
output,
|
|
1013
1013
|
meta: {
|
|
1014
1014
|
triggerId: task.triggerId,
|
|
1015
|
+
triggerTimestamp: task.triggerTimestamp,
|
|
1016
|
+
triggeredAt: new Date(task.triggerTimestamp).toISOString(),
|
|
1015
1017
|
runId: evaluatorRunId,
|
|
1016
1018
|
datasetName: task.dataset.getDisplayLabel(),
|
|
1017
1019
|
testCaseId: testCaseItem.id,
|
|
@@ -1492,6 +1494,7 @@ var EffectRunner = class {
|
|
|
1492
1494
|
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1493
1495
|
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1494
1496
|
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1497
|
+
const triggerTimestamp = request.triggerTimestamp ?? Date.now();
|
|
1495
1498
|
const snapshots = [];
|
|
1496
1499
|
for (const job of request.jobs) {
|
|
1497
1500
|
snapshots.push(
|
|
@@ -1499,6 +1502,7 @@ var EffectRunner = class {
|
|
|
1499
1502
|
datasetId: job.datasetId,
|
|
1500
1503
|
evaluatorIds: job.evaluatorIds,
|
|
1501
1504
|
triggerId,
|
|
1505
|
+
triggerTimestamp,
|
|
1502
1506
|
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1503
1507
|
globalEvaluationSemaphore: sem,
|
|
1504
1508
|
runConfigName: job.runConfigName,
|
|
@@ -1536,6 +1540,7 @@ var EffectRunner = class {
|
|
|
1536
1540
|
datasetId: request.datasetId,
|
|
1537
1541
|
evaluatorIds: request.evaluatorIds,
|
|
1538
1542
|
triggerId: request.triggerId,
|
|
1543
|
+
triggerTimestamp: request.triggerTimestamp ?? Date.now(),
|
|
1539
1544
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1540
1545
|
repetitions: request.repetitions,
|
|
1541
1546
|
runConfigName,
|
|
@@ -1563,6 +1568,7 @@ var EffectRunner = class {
|
|
|
1563
1568
|
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1564
1569
|
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1565
1570
|
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1571
|
+
const triggerTimestamp = params.triggerTimestamp ?? Date.now();
|
|
1566
1572
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1567
1573
|
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1568
1574
|
const snapshot = {
|
|
@@ -1606,6 +1612,7 @@ var EffectRunner = class {
|
|
|
1606
1612
|
effect.Queue.offer(this.runQueue, {
|
|
1607
1613
|
runId,
|
|
1608
1614
|
triggerId,
|
|
1615
|
+
triggerTimestamp,
|
|
1609
1616
|
datasetId: params.datasetId,
|
|
1610
1617
|
dataset: dataset.dataset,
|
|
1611
1618
|
evaluators: selectedEvaluators,
|
|
@@ -2038,6 +2045,7 @@ function RunView({
|
|
|
2038
2045
|
runConfigNames,
|
|
2039
2046
|
concurrency,
|
|
2040
2047
|
experimentName,
|
|
2048
|
+
triggerTimestamp,
|
|
2041
2049
|
onComplete
|
|
2042
2050
|
}) {
|
|
2043
2051
|
const [phase, setPhase] = React.useState("loading");
|
|
@@ -2208,7 +2216,8 @@ function RunView({
|
|
|
2208
2216
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2209
2217
|
jobs,
|
|
2210
2218
|
globalConcurrency: concurrency,
|
|
2211
|
-
experimentName
|
|
2219
|
+
experimentName,
|
|
2220
|
+
triggerTimestamp
|
|
2212
2221
|
});
|
|
2213
2222
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2214
2223
|
const snap = snapshots[i];
|
|
@@ -2265,7 +2274,7 @@ function RunView({
|
|
|
2265
2274
|
setPhase("completed");
|
|
2266
2275
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2267
2276
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2268
|
-
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2277
|
+
}, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
|
|
2269
2278
|
React.useEffect(() => {
|
|
2270
2279
|
void runEval();
|
|
2271
2280
|
}, [runEval]);
|
|
@@ -2753,7 +2762,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2753
2762
|
}
|
|
2754
2763
|
return lines;
|
|
2755
2764
|
}
|
|
2756
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2765
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
2757
2766
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2758
2767
|
if (jobs.length === 0) {
|
|
2759
2768
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2963,7 +2972,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
2963
2972
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2964
2973
|
jobs,
|
|
2965
2974
|
globalConcurrency: concurrency,
|
|
2966
|
-
experimentName
|
|
2975
|
+
experimentName,
|
|
2976
|
+
triggerTimestamp
|
|
2967
2977
|
});
|
|
2968
2978
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2969
2979
|
const snap = snapshots[i];
|
|
@@ -3063,7 +3073,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
3063
3073
|
}
|
|
3064
3074
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3065
3075
|
}
|
|
3066
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3076
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
3067
3077
|
return new Promise((resolve5, reject) => {
|
|
3068
3078
|
const app = ink.render(
|
|
3069
3079
|
React__namespace.createElement(RunView, {
|
|
@@ -3071,6 +3081,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
|
|
|
3071
3081
|
runConfigNames,
|
|
3072
3082
|
concurrency,
|
|
3073
3083
|
experimentName,
|
|
3084
|
+
triggerTimestamp,
|
|
3074
3085
|
onComplete: (err, exitCode) => {
|
|
3075
3086
|
app.unmount();
|
|
3076
3087
|
if (err) {
|
|
@@ -3128,11 +3139,13 @@ async function main() {
|
|
|
3128
3139
|
try {
|
|
3129
3140
|
if (args.command === "run") {
|
|
3130
3141
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
3142
|
+
const triggerTimestamp = Date.now();
|
|
3131
3143
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3132
3144
|
runner,
|
|
3133
3145
|
args.runConfigNames,
|
|
3134
3146
|
concurrency,
|
|
3135
|
-
args.experimentName
|
|
3147
|
+
args.experimentName,
|
|
3148
|
+
triggerTimestamp
|
|
3136
3149
|
);
|
|
3137
3150
|
if (args.ci && exitCode !== 0) {
|
|
3138
3151
|
process.exit(1);
|