@m4trix/evals 0.29.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli-simple.cjs +18 -6
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +18 -6
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +8 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +8 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +6 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +13 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -141,7 +141,7 @@ Group several dataset/evaluator runs under one named config. Each row is either
|
|
|
141
141
|
`evaluators: [...]` (same module instances discovery loads) or `evaluatorPattern: "..."`
|
|
142
142
|
(wildcard / regex rules from `RunnerApi.resolveEvaluatorsByNamePattern`). Multiple jobs share one `--concurrency` cap.
|
|
143
143
|
|
|
144
|
-
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
144
|
+
Optional **`repetitions`** on a row (default `1`) runs each matching test case that many times. Every execution in that group shares the same **`repetitionId`** in the evaluator callback **`meta`**, with **`repetitionIndex`** / **`repetitionCount`**. Evaluator **`meta`** includes **`triggerId`**, **`triggerTimestamp`** (ms since epoch when the run was triggered; the simple CLI captures this once per `run` invocation, before any job starts, so every job in that invocation shares it; when a caller omits it, the runner falls back to `Date.now()`), **`datasetName`** (`Dataset.getDisplayLabel()` → `displayName ?? name`), **`testCaseId`** (discovery id, same as runner events), **`testCaseName`** (`TestCase.getDisplayLabel()` → `displayName ?? name`), **`runConfigName`** (the **`RunConfig`** id or **`programmatic`** from **`PROGRAMMATIC_RUN_CONFIG`** for API/TUI-only **`runDatasetWith`**), optional **`experimentName`**, and declared tag lists **`testCaseTags`**, **`runConfigTags`**, and **`evaluatorTags`** (empty arrays when unset). **`Dataset`** **`includedTags` / `excludedTags`** only filter which test cases belong to a dataset; they are not the same as **`TestCase.describe({ tags })`** or **`Evaluator.define({ tags })`**, which label the case/evaluator and show up in **`meta`**. **`Dataset`** and **`TestCase`** follow the same naming convention as **`RunConfig`**: **`name`** is the stable id; optional **`displayName`** is unrestricted for UI. Names may use **kebab-case**, **snake_case**, **camelCase**, etc. (letters, digits, `_`, `-` only, no spaces); resolution is **case-insensitive**.
|
|
145
145
|
|
|
146
146
|
```ts
|
|
147
147
|
import { RunConfig } from '@m4trix/evals';
|
package/dist/cli-simple.cjs
CHANGED
|
@@ -1012,6 +1012,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1012
1012
|
output,
|
|
1013
1013
|
meta: {
|
|
1014
1014
|
triggerId: task.triggerId,
|
|
1015
|
+
triggerTimestamp: task.triggerTimestamp,
|
|
1015
1016
|
runId: evaluatorRunId,
|
|
1016
1017
|
datasetName: task.dataset.getDisplayLabel(),
|
|
1017
1018
|
testCaseId: testCaseItem.id,
|
|
@@ -1492,6 +1493,7 @@ var EffectRunner = class {
|
|
|
1492
1493
|
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
1493
1494
|
const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
1494
1495
|
const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1496
|
+
const triggerTimestamp = request.triggerTimestamp ?? Date.now();
|
|
1495
1497
|
const snapshots = [];
|
|
1496
1498
|
for (const job of request.jobs) {
|
|
1497
1499
|
snapshots.push(
|
|
@@ -1499,6 +1501,7 @@ var EffectRunner = class {
|
|
|
1499
1501
|
datasetId: job.datasetId,
|
|
1500
1502
|
evaluatorIds: job.evaluatorIds,
|
|
1501
1503
|
triggerId,
|
|
1504
|
+
triggerTimestamp,
|
|
1502
1505
|
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
1503
1506
|
globalEvaluationSemaphore: sem,
|
|
1504
1507
|
runConfigName: job.runConfigName,
|
|
@@ -1536,6 +1539,7 @@ var EffectRunner = class {
|
|
|
1536
1539
|
datasetId: request.datasetId,
|
|
1537
1540
|
evaluatorIds: request.evaluatorIds,
|
|
1538
1541
|
triggerId: request.triggerId,
|
|
1542
|
+
triggerTimestamp: request.triggerTimestamp ?? Date.now(),
|
|
1539
1543
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
1540
1544
|
repetitions: request.repetitions,
|
|
1541
1545
|
runConfigName,
|
|
@@ -1563,6 +1567,7 @@ var EffectRunner = class {
|
|
|
1563
1567
|
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
1564
1568
|
const runConfigTags = [...params.runConfigTags ?? []];
|
|
1565
1569
|
const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
|
|
1570
|
+
const triggerTimestamp = params.triggerTimestamp ?? Date.now();
|
|
1566
1571
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1567
1572
|
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
1568
1573
|
const snapshot = {
|
|
@@ -1606,6 +1611,7 @@ var EffectRunner = class {
|
|
|
1606
1611
|
effect.Queue.offer(this.runQueue, {
|
|
1607
1612
|
runId,
|
|
1608
1613
|
triggerId,
|
|
1614
|
+
triggerTimestamp,
|
|
1609
1615
|
datasetId: params.datasetId,
|
|
1610
1616
|
dataset: dataset.dataset,
|
|
1611
1617
|
evaluators: selectedEvaluators,
|
|
@@ -2038,6 +2044,7 @@ function RunView({
|
|
|
2038
2044
|
runConfigNames,
|
|
2039
2045
|
concurrency,
|
|
2040
2046
|
experimentName,
|
|
2047
|
+
triggerTimestamp,
|
|
2041
2048
|
onComplete
|
|
2042
2049
|
}) {
|
|
2043
2050
|
const [phase, setPhase] = React.useState("loading");
|
|
@@ -2208,7 +2215,8 @@ function RunView({
|
|
|
2208
2215
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2209
2216
|
jobs,
|
|
2210
2217
|
globalConcurrency: concurrency,
|
|
2211
|
-
experimentName
|
|
2218
|
+
experimentName,
|
|
2219
|
+
triggerTimestamp
|
|
2212
2220
|
});
|
|
2213
2221
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2214
2222
|
const snap = snapshots[i];
|
|
@@ -2265,7 +2273,7 @@ function RunView({
|
|
|
2265
2273
|
setPhase("completed");
|
|
2266
2274
|
const exitCode = failedTestCases > 0 ? 1 : 0;
|
|
2267
2275
|
setTimeout(() => onComplete(void 0, exitCode), 200);
|
|
2268
|
-
}, [runner, runConfigNames, concurrency, experimentName, onComplete]);
|
|
2276
|
+
}, [runner, runConfigNames, concurrency, experimentName, triggerTimestamp, onComplete]);
|
|
2269
2277
|
React.useEffect(() => {
|
|
2270
2278
|
void runEval();
|
|
2271
2279
|
}, [runEval]);
|
|
@@ -2753,7 +2761,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2753
2761
|
}
|
|
2754
2762
|
return lines;
|
|
2755
2763
|
}
|
|
2756
|
-
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName) {
|
|
2764
|
+
async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
2757
2765
|
const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
|
|
2758
2766
|
if (jobs.length === 0) {
|
|
2759
2767
|
throw new Error("No jobs expanded from RunConfigs.");
|
|
@@ -2963,7 +2971,8 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
2963
2971
|
const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
|
|
2964
2972
|
jobs,
|
|
2965
2973
|
globalConcurrency: concurrency,
|
|
2966
|
-
experimentName
|
|
2974
|
+
experimentName,
|
|
2975
|
+
triggerTimestamp
|
|
2967
2976
|
});
|
|
2968
2977
|
for (let i = 0; i < snapshots.length; i += 1) {
|
|
2969
2978
|
const snap = snapshots[i];
|
|
@@ -3063,7 +3072,7 @@ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency,
|
|
|
3063
3072
|
}
|
|
3064
3073
|
return failedTestCasesTotal > 0 ? 1 : 0;
|
|
3065
3074
|
}
|
|
3066
|
-
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName) {
|
|
3075
|
+
async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, experimentName, triggerTimestamp) {
|
|
3067
3076
|
return new Promise((resolve5, reject) => {
|
|
3068
3077
|
const app = ink.render(
|
|
3069
3078
|
React__namespace.createElement(RunView, {
|
|
@@ -3071,6 +3080,7 @@ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency, e
|
|
|
3071
3080
|
runConfigNames,
|
|
3072
3081
|
concurrency,
|
|
3073
3082
|
experimentName,
|
|
3083
|
+
triggerTimestamp,
|
|
3074
3084
|
onComplete: (err, exitCode) => {
|
|
3075
3085
|
app.unmount();
|
|
3076
3086
|
if (err) {
|
|
@@ -3128,11 +3138,13 @@ async function main() {
|
|
|
3128
3138
|
try {
|
|
3129
3139
|
if (args.command === "run") {
|
|
3130
3140
|
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
3141
|
+
const triggerTimestamp = Date.now();
|
|
3131
3142
|
const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
|
|
3132
3143
|
runner,
|
|
3133
3144
|
args.runConfigNames,
|
|
3134
3145
|
concurrency,
|
|
3135
|
-
args.experimentName
|
|
3146
|
+
args.experimentName,
|
|
3147
|
+
triggerTimestamp
|
|
3136
3148
|
);
|
|
3137
3149
|
if (args.ci && exitCode !== 0) {
|
|
3138
3150
|
process.exit(1);
|