@m4trix/evals 0.28.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli-simple.cjs +20 -6
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +20 -6
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +10 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +10 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +8 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -0
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -274,6 +274,11 @@ interface EvalMiddleware<TCtx> {
|
|
|
274
274
|
interface EvaluateMeta {
|
|
275
275
|
/** Identifier of the trigger that started the run (for example, a CLI invocation). */
|
|
276
276
|
triggerId: string;
|
|
277
|
+
/**
|
|
278
|
+
* Milliseconds since Unix epoch when the run was triggered (e.g. `Date.now()` at CLI start, or when
|
|
279
|
+
* `runDatasetWith` / `runDatasetJobsWithSharedConcurrency` was invoked). Shared across all jobs in a batch.
|
|
280
|
+
*/
|
|
281
|
+
triggerTimestamp: number;
|
|
277
282
|
/**
|
|
278
283
|
* Identifier of the current test-case execution shared across all evaluators
|
|
279
284
|
* for this specific test-case run.
|
|
@@ -281,6 +286,10 @@ interface EvaluateMeta {
|
|
|
281
286
|
runId: string;
|
|
282
287
|
/** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
|
|
283
288
|
datasetName: string;
|
|
289
|
+
/** Discovery id for the current test case (same as runner events’ `testCaseId`). */
|
|
290
|
+
testCaseId: string;
|
|
291
|
+
/** Display label for the test case (`TestCase.getDisplayLabel()`, i.e. `displayName ?? name`). */
|
|
292
|
+
testCaseName: string;
|
|
284
293
|
/** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
|
|
285
294
|
runConfigName: string;
|
|
286
295
|
/**
|
|
@@ -589,6 +598,10 @@ interface RunDatasetRequest {
|
|
|
589
598
|
* When omitted, the runner generates one in the format `trg-[uuid]`.
|
|
590
599
|
*/
|
|
591
600
|
triggerId?: string;
|
|
601
|
+
/**
|
|
602
|
+
* When the run was triggered (`Date.now()` ms); defaults to now. Forwarded as `meta.triggerTimestamp`.
|
|
603
|
+
*/
|
|
604
|
+
triggerTimestamp?: number;
|
|
592
605
|
datasetId: string;
|
|
593
606
|
evaluatorIds: ReadonlyArray<string>;
|
|
594
607
|
/** RunConfig name surfaced on evaluator `meta` (from the job or `PROGRAMMATIC_RUN_CONFIG`). */
|
|
@@ -693,6 +706,10 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
|
|
|
693
706
|
jobs: ReadonlyArray<RunDatasetJob>;
|
|
694
707
|
globalConcurrency: number;
|
|
695
708
|
triggerId?: string;
|
|
709
|
+
/**
|
|
710
|
+
* When the batch was triggered (`Date.now()` ms); defaults to now. CLI sets this once at command start.
|
|
711
|
+
*/
|
|
712
|
+
triggerTimestamp?: number;
|
|
696
713
|
/** Applied to every job in this batch (e.g. CLI `--experiment`). */
|
|
697
714
|
experimentName?: string;
|
|
698
715
|
}
|
package/dist/index.js
CHANGED
|
@@ -1566,8 +1566,11 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1566
1566
|
output,
|
|
1567
1567
|
meta: {
|
|
1568
1568
|
triggerId: task.triggerId,
|
|
1569
|
+
triggerTimestamp: task.triggerTimestamp,
|
|
1569
1570
|
runId: evaluatorRunId,
|
|
1570
1571
|
datasetName: task.dataset.getDisplayLabel(),
|
|
1572
|
+
testCaseId: testCaseItem.id,
|
|
1573
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1571
1574
|
repetitionId,
|
|
1572
1575
|
repetitionIndex,
|
|
1573
1576
|
repetitionCount,
|
|
@@ -2044,6 +2047,7 @@ var EffectRunner = class {
|
|
|
2044
2047
|
const globalConcurrency = Math.max(1, request.globalConcurrency);
|
|
2045
2048
|
const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
|
|
2046
2049
|
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
2050
|
+
const triggerTimestamp = request.triggerTimestamp ?? Date.now();
|
|
2047
2051
|
const snapshots = [];
|
|
2048
2052
|
for (const job of request.jobs) {
|
|
2049
2053
|
snapshots.push(
|
|
@@ -2051,6 +2055,7 @@ var EffectRunner = class {
|
|
|
2051
2055
|
datasetId: job.datasetId,
|
|
2052
2056
|
evaluatorIds: job.evaluatorIds,
|
|
2053
2057
|
triggerId,
|
|
2058
|
+
triggerTimestamp,
|
|
2054
2059
|
maxConcurrency: this.config.maxConcurrency ?? 1,
|
|
2055
2060
|
globalEvaluationSemaphore: sem,
|
|
2056
2061
|
runConfigName: job.runConfigName,
|
|
@@ -2088,6 +2093,7 @@ var EffectRunner = class {
|
|
|
2088
2093
|
datasetId: request.datasetId,
|
|
2089
2094
|
evaluatorIds: request.evaluatorIds,
|
|
2090
2095
|
triggerId: request.triggerId,
|
|
2096
|
+
triggerTimestamp: request.triggerTimestamp ?? Date.now(),
|
|
2091
2097
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2092
2098
|
repetitions: request.repetitions,
|
|
2093
2099
|
runConfigName,
|
|
@@ -2115,6 +2121,7 @@ var EffectRunner = class {
|
|
|
2115
2121
|
const totalEvaluations = selectedTestCases.length * repetitions;
|
|
2116
2122
|
const runConfigTags = [...params.runConfigTags ?? []];
|
|
2117
2123
|
const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
|
|
2124
|
+
const triggerTimestamp = params.triggerTimestamp ?? Date.now();
|
|
2118
2125
|
const runId = `run-${randomUUID()}`;
|
|
2119
2126
|
const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
|
|
2120
2127
|
const snapshot = {
|
|
@@ -2158,6 +2165,7 @@ var EffectRunner = class {
|
|
|
2158
2165
|
Queue.offer(this.runQueue, {
|
|
2159
2166
|
runId,
|
|
2160
2167
|
triggerId,
|
|
2168
|
+
triggerTimestamp,
|
|
2161
2169
|
datasetId: params.datasetId,
|
|
2162
2170
|
dataset: dataset.dataset,
|
|
2163
2171
|
evaluators: selectedEvaluators,
|