@m4trix/evals 0.27.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/cli-simple.cjs +38 -15
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +38 -15
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +13 -7
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +13 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +39 -14
- package/dist/index.js +17 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -132,7 +132,12 @@ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema
|
|
|
132
132
|
name: string;
|
|
133
133
|
/** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
|
|
134
134
|
displayName?: string;
|
|
135
|
-
|
|
135
|
+
/**
|
|
136
|
+
* Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
|
|
137
|
+
* `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
|
|
138
|
+
* `meta.testCaseTags`.
|
|
139
|
+
*/
|
|
140
|
+
tags?: ReadonlyArray<string>;
|
|
136
141
|
inputSchema: TI;
|
|
137
142
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
138
143
|
outputSchema?: TO;
|
|
@@ -156,7 +161,7 @@ declare function getTestCaseDisplayLabel(testCase: {
|
|
|
156
161
|
getDisplayLabel?: () => string;
|
|
157
162
|
getName?: () => string;
|
|
158
163
|
}): string;
|
|
159
|
-
/** Tags for evaluator `testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
164
|
+
/** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
160
165
|
declare function getTestCaseTagList(testCase: {
|
|
161
166
|
getTags?: () => ReadonlyArray<string>;
|
|
162
167
|
}): string[];
|
|
@@ -276,8 +281,16 @@ interface EvaluateMeta {
|
|
|
276
281
|
runId: string;
|
|
277
282
|
/** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
|
|
278
283
|
datasetName: string;
|
|
284
|
+
/** Discovery id for the current test case (same as runner events’ `testCaseId`). */
|
|
285
|
+
testCaseId: string;
|
|
286
|
+
/** Display label for the test case (`TestCase.getDisplayLabel()`, i.e. `displayName ?? name`). */
|
|
287
|
+
testCaseName: string;
|
|
279
288
|
/** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
|
|
280
289
|
runConfigName: string;
|
|
290
|
+
/**
|
|
291
|
+
* Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
|
|
292
|
+
*/
|
|
293
|
+
experimentName?: string;
|
|
281
294
|
/**
|
|
282
295
|
* Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
|
|
283
296
|
* (and present with count 1 for consistency).
|
|
@@ -287,6 +300,15 @@ interface EvaluateMeta {
|
|
|
287
300
|
repetitionIndex: number;
|
|
288
301
|
/** Total scheduled executions for this logical test case in the current run. */
|
|
289
302
|
repetitionCount: number;
|
|
303
|
+
/** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
|
|
304
|
+
testCaseTags: string[];
|
|
305
|
+
/**
|
|
306
|
+
* Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
|
|
307
|
+
* `RunDatasetRequest.runConfigTags`); empty when none.
|
|
308
|
+
*/
|
|
309
|
+
runConfigTags: string[];
|
|
310
|
+
/** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
|
|
311
|
+
evaluatorTags: string[];
|
|
290
312
|
}
|
|
291
313
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
292
314
|
input: TInput;
|
|
@@ -294,12 +316,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
294
316
|
output?: TOutput;
|
|
295
317
|
/** Metadata about the current evaluator invocation. */
|
|
296
318
|
meta: EvaluateMeta;
|
|
297
|
-
/** Tags from `TestCase.describe({ tags })` for the current test case. */
|
|
298
|
-
testCaseTags: string[];
|
|
299
|
-
/** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
|
|
300
|
-
runConfigTags: string[];
|
|
301
|
-
/** Tags from `Evaluator.define({ tags })` for this evaluator. */
|
|
302
|
-
evaluatorTags: string[];
|
|
303
319
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
304
320
|
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
305
321
|
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
@@ -328,7 +344,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
|
|
|
328
344
|
scoreSchema: TS;
|
|
329
345
|
passThreshold?: number;
|
|
330
346
|
passCriterion?: (score: unknown) => boolean;
|
|
331
|
-
/**
|
|
347
|
+
/**
|
|
348
|
+
* Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
|
|
349
|
+
* `meta.evaluatorTags`.
|
|
350
|
+
*/
|
|
332
351
|
tags?: ReadonlyArray<string>;
|
|
333
352
|
}
|
|
334
353
|
declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
|
|
@@ -360,7 +379,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
|
|
|
360
379
|
getDisplayLabel?: () => string | undefined;
|
|
361
380
|
getName?: () => string | undefined;
|
|
362
381
|
}): string | undefined;
|
|
363
|
-
/** Tags for evaluator `evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
382
|
+
/** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
364
383
|
declare function getEvaluatorTagList(evaluator: {
|
|
365
384
|
getTags?: () => ReadonlyArray<string>;
|
|
366
385
|
}): string[];
|
|
@@ -441,7 +460,7 @@ interface RunConfigDefineConfig {
|
|
|
441
460
|
name: string;
|
|
442
461
|
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
443
462
|
displayName?: string;
|
|
444
|
-
/** Optional tags; copied to every evaluation as `runConfigTags`. */
|
|
463
|
+
/** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
|
|
445
464
|
tags?: ReadonlyArray<string>;
|
|
446
465
|
runs: ReadonlyArray<RunConfigRow>;
|
|
447
466
|
}
|
|
@@ -458,7 +477,7 @@ declare class RunConfig {
|
|
|
458
477
|
getDisplayName(): string | undefined;
|
|
459
478
|
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
460
479
|
getDisplayLabel(): string;
|
|
461
|
-
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
480
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
|
|
462
481
|
getTags(): string[];
|
|
463
482
|
getRuns(): ReadonlyArray<RunConfigRow>;
|
|
464
483
|
}
|
|
@@ -547,7 +566,7 @@ interface RunDatasetJob {
|
|
|
547
566
|
*/
|
|
548
567
|
runConfigDisplayLabel?: string;
|
|
549
568
|
/**
|
|
550
|
-
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
|
|
569
|
+
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
|
|
551
570
|
*/
|
|
552
571
|
runConfigTags?: ReadonlyArray<string>;
|
|
553
572
|
/** Evaluates each matching test case this many times (default 1). */
|
|
@@ -584,9 +603,13 @@ interface RunDatasetRequest {
|
|
|
584
603
|
*/
|
|
585
604
|
repetitions?: number;
|
|
586
605
|
/**
|
|
587
|
-
* Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
606
|
+
* Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
588
607
|
*/
|
|
589
608
|
runConfigTags?: ReadonlyArray<string>;
|
|
609
|
+
/**
|
|
610
|
+
* Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
|
|
611
|
+
*/
|
|
612
|
+
experimentName?: string;
|
|
590
613
|
}
|
|
591
614
|
interface RunSnapshot {
|
|
592
615
|
runId: string;
|
|
@@ -674,6 +697,8 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
|
|
|
674
697
|
jobs: ReadonlyArray<RunDatasetJob>;
|
|
675
698
|
globalConcurrency: number;
|
|
676
699
|
triggerId?: string;
|
|
700
|
+
/** Applied to every job in this batch (e.g. CLI `--experiment`). */
|
|
701
|
+
experimentName?: string;
|
|
677
702
|
}
|
|
678
703
|
interface RunnerApi {
|
|
679
704
|
collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
|
package/dist/index.js
CHANGED
|
@@ -816,7 +816,7 @@ var RunConfig = class _RunConfig {
|
|
|
816
816
|
getDisplayLabel() {
|
|
817
817
|
return this._displayName ?? this._name;
|
|
818
818
|
}
|
|
819
|
-
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
819
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
|
|
820
820
|
getTags() {
|
|
821
821
|
return [...this._tags];
|
|
822
822
|
}
|
|
@@ -989,10 +989,11 @@ var TestCase = class _TestCase {
|
|
|
989
989
|
static describe(config) {
|
|
990
990
|
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
991
991
|
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
992
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
992
993
|
return new _TestCase({
|
|
993
994
|
name,
|
|
994
995
|
displayName,
|
|
995
|
-
tags
|
|
996
|
+
tags,
|
|
996
997
|
inputSchema: config.inputSchema,
|
|
997
998
|
input: config.input,
|
|
998
999
|
outputSchema: config.outputSchema,
|
|
@@ -1009,7 +1010,7 @@ var TestCase = class _TestCase {
|
|
|
1009
1010
|
return this._config.displayName ?? this._config.name;
|
|
1010
1011
|
}
|
|
1011
1012
|
getTags() {
|
|
1012
|
-
return this._config.tags;
|
|
1013
|
+
return [...this._config.tags];
|
|
1013
1014
|
}
|
|
1014
1015
|
getInputSchema() {
|
|
1015
1016
|
return this._config.inputSchema;
|
|
@@ -1567,14 +1568,17 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1567
1568
|
triggerId: task.triggerId,
|
|
1568
1569
|
runId: evaluatorRunId,
|
|
1569
1570
|
datasetName: task.dataset.getDisplayLabel(),
|
|
1571
|
+
testCaseId: testCaseItem.id,
|
|
1572
|
+
testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
|
|
1570
1573
|
repetitionId,
|
|
1571
1574
|
repetitionIndex,
|
|
1572
1575
|
repetitionCount,
|
|
1573
|
-
runConfigName: task.runConfigName
|
|
1576
|
+
runConfigName: task.runConfigName,
|
|
1577
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1578
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1579
|
+
runConfigTags: task.runConfigTags,
|
|
1580
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1574
1581
|
},
|
|
1575
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1576
|
-
runConfigTags: task.runConfigTags,
|
|
1577
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1578
1582
|
logDiff,
|
|
1579
1583
|
log,
|
|
1580
1584
|
createError
|
|
@@ -2053,7 +2057,8 @@ var EffectRunner = class {
|
|
|
2053
2057
|
globalEvaluationSemaphore: sem,
|
|
2054
2058
|
runConfigName: job.runConfigName,
|
|
2055
2059
|
runConfigTags: job.runConfigTags,
|
|
2056
|
-
repetitions: job.repetitions
|
|
2060
|
+
repetitions: job.repetitions,
|
|
2061
|
+
experimentName: request.experimentName
|
|
2057
2062
|
})
|
|
2058
2063
|
);
|
|
2059
2064
|
}
|
|
@@ -2088,7 +2093,8 @@ var EffectRunner = class {
|
|
|
2088
2093
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2089
2094
|
repetitions: request.repetitions,
|
|
2090
2095
|
runConfigName,
|
|
2091
|
-
runConfigTags: request.runConfigTags
|
|
2096
|
+
runConfigTags: request.runConfigTags,
|
|
2097
|
+
experimentName: request.experimentName
|
|
2092
2098
|
});
|
|
2093
2099
|
}
|
|
2094
2100
|
async startDatasetRun(params) {
|
|
@@ -2163,7 +2169,8 @@ var EffectRunner = class {
|
|
|
2163
2169
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2164
2170
|
runConfigName: params.runConfigName,
|
|
2165
2171
|
runConfigTags,
|
|
2166
|
-
repetitions
|
|
2172
|
+
repetitions,
|
|
2173
|
+
experimentName: params.experimentName
|
|
2167
2174
|
})
|
|
2168
2175
|
);
|
|
2169
2176
|
return snapshot;
|