@m4trix/evals 0.27.0 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/cli-simple.cjs +36 -15
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +36 -15
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +11 -7
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +11 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +15 -10
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +35 -14
- package/dist/index.js +15 -10
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -132,7 +132,12 @@ interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema
|
|
|
132
132
|
name: string;
|
|
133
133
|
/** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
|
|
134
134
|
displayName?: string;
|
|
135
|
-
|
|
135
|
+
/**
|
|
136
|
+
* Declared tags on this test case (not `Dataset` filter options). Use `Dataset` `includedTags` /
|
|
137
|
+
* `excludedTags` to select which cases belong to a dataset; evaluators read the resolved tags as
|
|
138
|
+
* `meta.testCaseTags`.
|
|
139
|
+
*/
|
|
140
|
+
tags?: ReadonlyArray<string>;
|
|
136
141
|
inputSchema: TI;
|
|
137
142
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
138
143
|
outputSchema?: TO;
|
|
@@ -156,7 +161,7 @@ declare function getTestCaseDisplayLabel(testCase: {
|
|
|
156
161
|
getDisplayLabel?: () => string;
|
|
157
162
|
getName?: () => string;
|
|
158
163
|
}): string;
|
|
159
|
-
/** Tags for evaluator `testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
164
|
+
/** Tags for evaluator `meta.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
160
165
|
declare function getTestCaseTagList(testCase: {
|
|
161
166
|
getTags?: () => ReadonlyArray<string>;
|
|
162
167
|
}): string[];
|
|
@@ -278,6 +283,10 @@ interface EvaluateMeta {
|
|
|
278
283
|
datasetName: string;
|
|
279
284
|
/** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
|
|
280
285
|
runConfigName: string;
|
|
286
|
+
/**
|
|
287
|
+
* Optional label for this invocation (e.g. CLI `--experiment`); omitted when not set.
|
|
288
|
+
*/
|
|
289
|
+
experimentName?: string;
|
|
281
290
|
/**
|
|
282
291
|
* Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
|
|
283
292
|
* (and present with count 1 for consistency).
|
|
@@ -287,6 +296,15 @@ interface EvaluateMeta {
|
|
|
287
296
|
repetitionIndex: number;
|
|
288
297
|
/** Total scheduled executions for this logical test case in the current run. */
|
|
289
298
|
repetitionCount: number;
|
|
299
|
+
/** Declared tags on the current test case (`TestCase.describe({ tags })`); empty when none. */
|
|
300
|
+
testCaseTags: string[];
|
|
301
|
+
/**
|
|
302
|
+
* Declared tags on the current run config or programmatic request (`RunConfig.define({ tags })`,
|
|
303
|
+
* `RunDatasetRequest.runConfigTags`); empty when none.
|
|
304
|
+
*/
|
|
305
|
+
runConfigTags: string[];
|
|
306
|
+
/** Declared tags on this evaluator (`Evaluator.define({ tags })`); empty when none. */
|
|
307
|
+
evaluatorTags: string[];
|
|
290
308
|
}
|
|
291
309
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
292
310
|
input: TInput;
|
|
@@ -294,12 +312,6 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
294
312
|
output?: TOutput;
|
|
295
313
|
/** Metadata about the current evaluator invocation. */
|
|
296
314
|
meta: EvaluateMeta;
|
|
297
|
-
/** Tags from `TestCase.describe({ tags })` for the current test case. */
|
|
298
|
-
testCaseTags: string[];
|
|
299
|
-
/** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
|
|
300
|
-
runConfigTags: string[];
|
|
301
|
-
/** Tags from `Evaluator.define({ tags })` for this evaluator. */
|
|
302
|
-
evaluatorTags: string[];
|
|
303
315
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
304
316
|
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
305
317
|
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
@@ -328,7 +340,10 @@ interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.
|
|
|
328
340
|
scoreSchema: TS;
|
|
329
341
|
passThreshold?: number;
|
|
330
342
|
passCriterion?: (score: unknown) => boolean;
|
|
331
|
-
/**
|
|
343
|
+
/**
|
|
344
|
+
* Declared tags for this evaluator (not dataset filter rules); echoed on every `evaluate` call as
|
|
345
|
+
* `meta.evaluatorTags`.
|
|
346
|
+
*/
|
|
332
347
|
tags?: ReadonlyArray<string>;
|
|
333
348
|
}
|
|
334
349
|
declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
|
|
@@ -360,7 +375,7 @@ declare function getEvaluatorDisplayLabel(evaluator: {
|
|
|
360
375
|
getDisplayLabel?: () => string | undefined;
|
|
361
376
|
getName?: () => string | undefined;
|
|
362
377
|
}): string | undefined;
|
|
363
|
-
/** Tags for evaluator `evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
378
|
+
/** Tags for evaluator `meta.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
364
379
|
declare function getEvaluatorTagList(evaluator: {
|
|
365
380
|
getTags?: () => ReadonlyArray<string>;
|
|
366
381
|
}): string[];
|
|
@@ -441,7 +456,7 @@ interface RunConfigDefineConfig {
|
|
|
441
456
|
name: string;
|
|
442
457
|
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
443
458
|
displayName?: string;
|
|
444
|
-
/** Optional tags; copied to every evaluation as `runConfigTags`. */
|
|
459
|
+
/** Optional declared tags for this run config; copied to every evaluation as `meta.runConfigTags`. */
|
|
445
460
|
tags?: ReadonlyArray<string>;
|
|
446
461
|
runs: ReadonlyArray<RunConfigRow>;
|
|
447
462
|
}
|
|
@@ -458,7 +473,7 @@ declare class RunConfig {
|
|
|
458
473
|
getDisplayName(): string | undefined;
|
|
459
474
|
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
460
475
|
getDisplayLabel(): string;
|
|
461
|
-
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
476
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
|
|
462
477
|
getTags(): string[];
|
|
463
478
|
getRuns(): ReadonlyArray<RunConfigRow>;
|
|
464
479
|
}
|
|
@@ -547,7 +562,7 @@ interface RunDatasetJob {
|
|
|
547
562
|
*/
|
|
548
563
|
runConfigDisplayLabel?: string;
|
|
549
564
|
/**
|
|
550
|
-
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
|
|
565
|
+
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `meta.runConfigTags` on evaluator callbacks.
|
|
551
566
|
*/
|
|
552
567
|
runConfigTags?: ReadonlyArray<string>;
|
|
553
568
|
/** Evaluates each matching test case this many times (default 1). */
|
|
@@ -584,9 +599,13 @@ interface RunDatasetRequest {
|
|
|
584
599
|
*/
|
|
585
600
|
repetitions?: number;
|
|
586
601
|
/**
|
|
587
|
-
* Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
602
|
+
* Optional tags for this run; forwarded as `meta.runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
588
603
|
*/
|
|
589
604
|
runConfigTags?: ReadonlyArray<string>;
|
|
605
|
+
/**
|
|
606
|
+
* Optional label for this run; forwarded as `experimentName` on evaluator `meta`.
|
|
607
|
+
*/
|
|
608
|
+
experimentName?: string;
|
|
590
609
|
}
|
|
591
610
|
interface RunSnapshot {
|
|
592
611
|
runId: string;
|
|
@@ -674,6 +693,8 @@ interface RunDatasetJobsWithSharedConcurrencyRequest {
|
|
|
674
693
|
jobs: ReadonlyArray<RunDatasetJob>;
|
|
675
694
|
globalConcurrency: number;
|
|
676
695
|
triggerId?: string;
|
|
696
|
+
/** Applied to every job in this batch (e.g. CLI `--experiment`). */
|
|
697
|
+
experimentName?: string;
|
|
677
698
|
}
|
|
678
699
|
interface RunnerApi {
|
|
679
700
|
collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
|
package/dist/index.js
CHANGED
|
@@ -816,7 +816,7 @@ var RunConfig = class _RunConfig {
|
|
|
816
816
|
getDisplayLabel() {
|
|
817
817
|
return this._displayName ?? this._name;
|
|
818
818
|
}
|
|
819
|
-
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
819
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `meta.runConfigTags` on evaluator callbacks. */
|
|
820
820
|
getTags() {
|
|
821
821
|
return [...this._tags];
|
|
822
822
|
}
|
|
@@ -989,10 +989,11 @@ var TestCase = class _TestCase {
|
|
|
989
989
|
static describe(config) {
|
|
990
990
|
const name = validateTestCaseName(config.name, "TestCase.describe");
|
|
991
991
|
const displayName = normalizeOptionalDisplayName(config.displayName);
|
|
992
|
+
const tags = config.tags !== void 0 ? [...config.tags] : [];
|
|
992
993
|
return new _TestCase({
|
|
993
994
|
name,
|
|
994
995
|
displayName,
|
|
995
|
-
tags
|
|
996
|
+
tags,
|
|
996
997
|
inputSchema: config.inputSchema,
|
|
997
998
|
input: config.input,
|
|
998
999
|
outputSchema: config.outputSchema,
|
|
@@ -1009,7 +1010,7 @@ var TestCase = class _TestCase {
|
|
|
1009
1010
|
return this._config.displayName ?? this._config.name;
|
|
1010
1011
|
}
|
|
1011
1012
|
getTags() {
|
|
1012
|
-
return this._config.tags;
|
|
1013
|
+
return [...this._config.tags];
|
|
1013
1014
|
}
|
|
1014
1015
|
getInputSchema() {
|
|
1015
1016
|
return this._config.inputSchema;
|
|
@@ -1570,11 +1571,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1570
1571
|
repetitionId,
|
|
1571
1572
|
repetitionIndex,
|
|
1572
1573
|
repetitionCount,
|
|
1573
|
-
runConfigName: task.runConfigName
|
|
1574
|
+
runConfigName: task.runConfigName,
|
|
1575
|
+
...task.experimentName !== void 0 && task.experimentName !== "" ? { experimentName: task.experimentName } : {},
|
|
1576
|
+
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1577
|
+
runConfigTags: task.runConfigTags,
|
|
1578
|
+
evaluatorTags: getEvaluatorTagList(evaluator)
|
|
1574
1579
|
},
|
|
1575
|
-
testCaseTags: getTestCaseTagList(testCaseItem.testCase),
|
|
1576
|
-
runConfigTags: task.runConfigTags,
|
|
1577
|
-
evaluatorTags: getEvaluatorTagList(evaluator),
|
|
1578
1580
|
logDiff,
|
|
1579
1581
|
log,
|
|
1580
1582
|
createError
|
|
@@ -2053,7 +2055,8 @@ var EffectRunner = class {
|
|
|
2053
2055
|
globalEvaluationSemaphore: sem,
|
|
2054
2056
|
runConfigName: job.runConfigName,
|
|
2055
2057
|
runConfigTags: job.runConfigTags,
|
|
2056
|
-
repetitions: job.repetitions
|
|
2058
|
+
repetitions: job.repetitions,
|
|
2059
|
+
experimentName: request.experimentName
|
|
2057
2060
|
})
|
|
2058
2061
|
);
|
|
2059
2062
|
}
|
|
@@ -2088,7 +2091,8 @@ var EffectRunner = class {
|
|
|
2088
2091
|
maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
|
|
2089
2092
|
repetitions: request.repetitions,
|
|
2090
2093
|
runConfigName,
|
|
2091
|
-
runConfigTags: request.runConfigTags
|
|
2094
|
+
runConfigTags: request.runConfigTags,
|
|
2095
|
+
experimentName: request.experimentName
|
|
2092
2096
|
});
|
|
2093
2097
|
}
|
|
2094
2098
|
async startDatasetRun(params) {
|
|
@@ -2163,7 +2167,8 @@ var EffectRunner = class {
|
|
|
2163
2167
|
globalEvaluationSemaphore: params.globalEvaluationSemaphore,
|
|
2164
2168
|
runConfigName: params.runConfigName,
|
|
2165
2169
|
runConfigTags,
|
|
2166
|
-
repetitions
|
|
2170
|
+
repetitions,
|
|
2171
|
+
experimentName: params.experimentName
|
|
2167
2172
|
})
|
|
2168
2173
|
);
|
|
2169
2174
|
return snapshot;
|