@m4trix/evals 0.28.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli-simple.cjs +20 -6
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +20 -6
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +10 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +10 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +8 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -141,7 +141,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
|
|
|
141
141
|
`evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
|
|
142
142
|
(wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
143
143
|
|
|
144
|
-
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
144
|
+
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`triggerId`**, **`triggerTimestamp`** (ms since epoch when the run was triggered; the simple CLI sets this once at process start), **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
145
145
|
|
|
146
146
|
```ts
|
|
147
147
|
import { RunConfig } from '@m4trix/evals';
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -1012,8 +1012,11 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1012
1012
|
output,
|
|
1013
1013
|
meta: {
|
|
1014
1014
|
triggerId: task.triggerId,
|
|
1015
|
+
triggerTimestamp: task.triggerTimestamp,
|
|
1015
1016
|
runId: evaluatorRunId,
|
|
1016
1017
|
datasetName: task.dataset.getDisplayLabel(),
|
|
1018
|
+
testCaseId: testCaseItem.id,
|
|
1019
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1017
1020
|
repetitionId,
|
|
1018
1021
|
repetitionIndex,
|
|
1019
1022
|
repetitionCount,
|
|
@@ -1490,6 +1493,7 @@ var EffectRunner = class {
|
|
|
1490
1493
|
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1491
1494
|
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1492
1495
|
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1496
|
+
const triggerTimestamp = request.triggerTimestamp ?? Date.now();
|
|
1493
1497
|
const snapshots = [];
|
|
1494
1498
|
for (const job of request.jobs) {
|
|
1495
1499
|
snapshots.push(
|
|
@@ -1497,6 +1501,7 @@ var EffectRunner = class {
|
|
|
1497
1501
|
datasetId: job.datasetId,
|
|
1498
1502
|
evaluatorIds: job.evaluatorIds,
|
|
1499
1503
|
triggerId,
|
|
1504
|
+
triggerTimestamp,
|
|
1500
1505
|
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1501
1506
|
globalEvaluationSemaphore: sem,
|
|
1502
1507
|
runConfigName: job.runConfigName,
|
|
@@ -1534,6 +1539,7 @@ var EffectRunner = class {
|
|
|
1534
1539
|
datasetId: request.datasetId,
|
|
1535
1540
|
evaluatorIds: request.evaluatorIds,
|
|
1536
1541
|
triggerId: request.triggerId,
|
|
1542
|
+
triggerTimestamp: request.triggerTimestamp ?? Date.now(),
|
|
1537
1543
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1538
1544
|
repetitions: request.repetitions,
|
|
1539
1545
|
runConfigName,
|
|
@@ -1561,6 +1567,7 @@ var EffectRunner = class {
|
|
|
1561
1567
|
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1562
1568
|
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1563
1569
|
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1570
|
+
const triggerTimestamp = params.triggerTimestamp ?? Date.now();
|
|
1564
1571
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1565
1572
|
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1566
1573
|
const snapshot = {
|
|
@@ -1604,6 +1611,7 @@ var EffectRunner = class {
|
|
|
1604
1611
|
effect.Queue.offer(this.runQueue, {
|
|
1605
1612
|
runId,
|
|
1606
1613
|
triggerId,
|
|
1614
|
+
triggerTimestamp,
|
|
1607
1615
|
datasetId: params.datasetId,
|
|
1608
1616
|
dataset: dataset.dataset,
|
|
1609
1617
|
evaluators: selectedEvaluators,
|
|
@@ -2036,6 +2044,7 @@ function RunView({
|
|
|
2036
2044
|
runConfigNames,
|
|
2037
2045
|
concurrency,
|
|
2038
2046
|
experimentName,
|
|
2047
|
+
triggerTimestamp,
|
|
2039
2048
|
onComplete
|
|
2040
2049
|
}) {
|
|
2041
2050
|
const [phase, setPhase] = React.useState("loading");
|
|
@@ -2206,7 +2215,8 @@ function RunView({
|
|
|
2206
2215
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2207
2216
|
jobs,
|
|
2208
2217
|
globalConcurrency: concurrency,
|
|
2209
|
-
experimentName
|
|
2218
|
+
experimentName,
|
|
2219
|
+
triggerTimestamp
|
|
2210
2220
|
});
|
|
2211
2221
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2212
2222
|
const snap = snapshots[i];
|
|
@@ -2263,7 +2273,7 @@ function RunView({
|
|
|
2263
2273
|
setPhase("completed");
|
|
2264
2274
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2265
2275
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2266
|
-
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2276
|
+
}, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
|
|
2267
2277
|
React.useEffect(() => {
|
|
2268
2278
|
void runEval();
|
|
2269
2279
|
}, [runEval]);
|
|
@@ -2751,7 +2761,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2751
2761
|
}
|
|
2752
2762
|
return lines;
|
|
2753
2763
|
}
|
|
2754
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2764
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
2755
2765
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2756
2766
|
if (jobs.length === 0) {
|
|
2757
2767
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2961,7 +2971,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
2961
2971
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2962
2972
|
jobs,
|
|
2963
2973
|
globalConcurrency: concurrency,
|
|
2964
|
-
experimentName
|
|
2974
|
+
experimentName,
|
|
2975
|
+
triggerTimestamp
|
|
2965
2976
|
});
|
|
2966
2977
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2967
2978
|
const snap = snapshots[i];
|
|
@@ -3061,7 +3072,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
3061
3072
|
}
|
|
3062
3073
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3063
3074
|
}
|
|
3064
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3075
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
3065
3076
|
return new Promise((resolve5, reject) => {
|
|
3066
3077
|
const app = ink.render(
|
|
3067
3078
|
React__namespace.createElement(RunView, {
|
|
@@ -3069,6 +3080,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
|
|
|
3069
3080
|
runConfigNames,
|
|
3070
3081
|
concurrency,
|
|
3071
3082
|
experimentName,
|
|
3083
|
+
triggerTimestamp,
|
|
3072
3084
|
onComplete: (err, exitCode) => {
|
|
3073
3085
|
app.unmount();
|
|
3074
3086
|
if (err) {
|
|
@@ -3126,11 +3138,13 @@ async function main() {
|
|
|
3126
3138
|
try {
|
|
3127
3139
|
if (args.command === "run") {
|
|
3128
3140
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
3141
|
+
const triggerTimestamp = Date.now();
|
|
3129
3142
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3130
3143
|
runner,
|
|
3131
3144
|
args.runConfigNames,
|
|
3132
3145
|
concurrency,
|
|
3133
|
-
args.experimentName
|
|
3146
|
+
args.experimentName,
|
|
3147
|
+
triggerTimestamp
|
|
3134
3148
|
);
|
|
3135
3149
|
if (args.ci && exitCode !== 0) {
|
|
3136
3150
|
process.exit(1);
|