@m4trix/evals 0.25.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -7
- package/dist/cli-simple.cjs +831 -450
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +832 -451
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +531 -270
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +531 -270
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +888 -509
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +201 -7
- package/dist/index.js +878 -513
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -86,6 +86,7 @@ interface RunnerDiscoveryConfig {
|
|
|
86
86
|
rootDir: string;
|
|
87
87
|
datasetSuffixes: ReadonlyArray<string>;
|
|
88
88
|
evaluatorSuffixes: ReadonlyArray<string>;
|
|
89
|
+
runConfigSuffixes: ReadonlyArray<string>;
|
|
89
90
|
testCaseSuffixes: ReadonlyArray<string>;
|
|
90
91
|
excludeDirectories: ReadonlyArray<string>;
|
|
91
92
|
}
|
|
@@ -102,9 +103,11 @@ interface M4trixEvalConfigDiscovery {
|
|
|
102
103
|
rootDir?: string;
|
|
103
104
|
datasetFilePatterns?: ReadonlyArray<string>;
|
|
104
105
|
evaluatorFilePatterns?: ReadonlyArray<string>;
|
|
106
|
+
runConfigFilePatterns?: ReadonlyArray<string>;
|
|
105
107
|
testCaseFilePatterns?: ReadonlyArray<string>;
|
|
106
108
|
datasetSuffixes?: ReadonlyArray<string>;
|
|
107
109
|
evaluatorSuffixes?: ReadonlyArray<string>;
|
|
110
|
+
runConfigSuffixes?: ReadonlyArray<string>;
|
|
108
111
|
testCaseSuffixes?: ReadonlyArray<string>;
|
|
109
112
|
excludeDirectories?: ReadonlyArray<string>;
|
|
110
113
|
}
|
|
@@ -127,9 +130,14 @@ type PathMatcher = string | RegExp;
|
|
|
127
130
|
|
|
128
131
|
type InputOrBuilder<T> = T | (() => T);
|
|
129
132
|
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
133
|
+
/**
|
|
134
|
+
* Stable id (letters, digits, `_`, `-`).
|
|
135
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
136
|
+
*/
|
|
130
137
|
name: string;
|
|
138
|
+
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
139
|
+
displayName?: string;
|
|
131
140
|
tags: string[];
|
|
132
|
-
reruns?: number;
|
|
133
141
|
inputSchema: TI;
|
|
134
142
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
135
143
|
outputSchema?: TO;
|
|
@@ -139,14 +147,24 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
|
|
|
139
147
|
private readonly _config;
|
|
140
148
|
private constructor();
|
|
141
149
|
static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
|
|
142
|
-
getReruns(): number;
|
|
143
150
|
getName(): string;
|
|
151
|
+
getDisplayName(): string | undefined;
|
|
152
|
+
getDisplayLabel(): string;
|
|
144
153
|
getTags(): string[];
|
|
145
154
|
getInputSchema(): Schema.Schema.Any;
|
|
146
155
|
getInput(): TInput;
|
|
147
156
|
getOutputSchema(): Schema.Schema.Any | undefined;
|
|
148
157
|
getOutput(): TOutput | undefined;
|
|
149
158
|
}
|
|
159
|
+
/** CLI-friendly label: {@link TestCase.getDisplayLabel} when present, else {@link TestCase.getName} (supports plain test-case-shaped objects). */
|
|
160
|
+
declare function getTestCaseDisplayLabel(testCase: {
|
|
161
|
+
getDisplayLabel?: () => string;
|
|
162
|
+
getName?: () => string;
|
|
163
|
+
}): string;
|
|
164
|
+
/** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
165
|
+
declare function getTestCaseTagList(testCase: {
|
|
166
|
+
getTags?: () => ReadonlyArray<string>;
|
|
167
|
+
}): string[];
|
|
150
168
|
|
|
151
169
|
interface DatasetDefineConfig {
|
|
152
170
|
name: string;
|
|
@@ -243,6 +261,17 @@ interface EvaluateMeta {
|
|
|
243
261
|
runId: string;
|
|
244
262
|
/** Identifier of the dataset currently being evaluated. */
|
|
245
263
|
datasetId: string;
|
|
264
|
+
/** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
|
|
265
|
+
runConfigName: string;
|
|
266
|
+
/**
|
|
267
|
+
* Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
|
|
268
|
+
* (and present with count 1 for consistency).
|
|
269
|
+
*/
|
|
270
|
+
repetitionId: string;
|
|
271
|
+
/** 1-based index of this execution within the repetition group. */
|
|
272
|
+
repetitionIndex: number;
|
|
273
|
+
/** Total scheduled executions for this logical test case in the current run. */
|
|
274
|
+
repetitionCount: number;
|
|
246
275
|
}
|
|
247
276
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
248
277
|
input: TInput;
|
|
@@ -250,6 +279,12 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
250
279
|
output?: TOutput;
|
|
251
280
|
/** Metadata about the current evaluator invocation. */
|
|
252
281
|
meta: EvaluateMeta;
|
|
282
|
+
/** Tags from `TestCase.describe({ tags })` for the current test case. */
|
|
283
|
+
testCaseTags: string[];
|
|
284
|
+
/** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
|
|
285
|
+
runConfigTags: string[];
|
|
286
|
+
/** Tags from `Evaluator.define({ tags })` for this evaluator. */
|
|
287
|
+
evaluatorTags: string[];
|
|
253
288
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
254
289
|
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
255
290
|
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
@@ -266,12 +301,20 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
266
301
|
}
|
|
267
302
|
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
|
|
268
303
|
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
304
|
+
/**
|
|
305
|
+
* Stable id (letters, digits, `_`, `-`); used for discovery, name patterns, and `meta`.
|
|
306
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
307
|
+
*/
|
|
269
308
|
name: string;
|
|
309
|
+
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
310
|
+
displayName?: string;
|
|
270
311
|
inputSchema: TI;
|
|
271
312
|
outputSchema: TO;
|
|
272
313
|
scoreSchema: TS;
|
|
273
314
|
passThreshold?: number;
|
|
274
315
|
passCriterion?: (score: unknown) => boolean;
|
|
316
|
+
/** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
|
|
317
|
+
tags?: ReadonlyArray<string>;
|
|
275
318
|
}
|
|
276
319
|
declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
|
|
277
320
|
private readonly _config;
|
|
@@ -281,7 +324,13 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
|
|
|
281
324
|
use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
|
|
282
325
|
define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
|
|
283
326
|
evaluate(fn: EvaluateFn<TInput, TOutput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
|
|
327
|
+
/** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
|
|
284
328
|
getName(): string | undefined;
|
|
329
|
+
getDisplayName(): string | undefined;
|
|
330
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
|
|
331
|
+
getDisplayLabel(): string | undefined;
|
|
332
|
+
/** Tags from `Evaluator.define({ tags })`; empty until defined. */
|
|
333
|
+
getTags(): string[];
|
|
285
334
|
getInputSchema(): Schema.Schema.Any | undefined;
|
|
286
335
|
getOutputSchema(): Schema.Schema.Any | undefined;
|
|
287
336
|
getScoreSchema(): Schema.Schema.Any | undefined;
|
|
@@ -291,6 +340,15 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
|
|
|
291
340
|
getPassCriterion(): ((score: unknown) => boolean) | undefined;
|
|
292
341
|
resolveContext(): Promise<TCtx>;
|
|
293
342
|
}
|
|
343
|
+
/** CLI-friendly label: {@link Evaluator.getDisplayLabel} when present, else {@link Evaluator.getName} (supports plain evaluator-shaped objects from discovery). */
|
|
344
|
+
declare function getEvaluatorDisplayLabel(evaluator: {
|
|
345
|
+
getDisplayLabel?: () => string | undefined;
|
|
346
|
+
getName?: () => string | undefined;
|
|
347
|
+
}): string | undefined;
|
|
348
|
+
/** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
349
|
+
declare function getEvaluatorTagList(evaluator: {
|
|
350
|
+
getTags?: () => ReadonlyArray<string>;
|
|
351
|
+
}): string[];
|
|
294
352
|
|
|
295
353
|
interface MetricItem<TData = unknown> {
|
|
296
354
|
readonly id: string;
|
|
@@ -320,6 +378,72 @@ declare const Metric: {
|
|
|
320
378
|
};
|
|
321
379
|
declare function getMetricById(id: string): MetricDef<unknown> | undefined;
|
|
322
380
|
|
|
381
|
+
/** Branded id for `RunConfig` `name` (decode with {@link RunConfigNameSchema}). */
|
|
382
|
+
declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "RunConfigName">;
|
|
383
|
+
/** Branded id for `Evaluator.define({ name })` (decode with {@link EvaluatorNameSchema}). */
|
|
384
|
+
declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
|
|
385
|
+
/** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
|
|
386
|
+
declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
|
|
387
|
+
type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
|
|
388
|
+
type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
|
|
389
|
+
type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
|
|
390
|
+
declare function validateRunConfigName(raw: string, context: string): RunConfigName;
|
|
391
|
+
declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
|
|
392
|
+
declare function validateTestCaseName(raw: string, context: string): TestCaseName;
|
|
393
|
+
/** Optional UI label: trim; empty after trim becomes undefined. */
|
|
394
|
+
declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
|
|
395
|
+
|
|
396
|
+
/** Heterogeneous evaluator rows; `unknown` breaks assignability from concrete `EvaluateFn` (contravariance on `input`). */
|
|
397
|
+
type RunConfigEvaluatorRef = Evaluator<any, any, any, any>;
|
|
398
|
+
/** Select evaluators by concrete instances (same module exports as discovery). */
|
|
399
|
+
interface RunConfigRowEvaluators {
|
|
400
|
+
readonly dataset: Dataset;
|
|
401
|
+
readonly evaluators: ReadonlyArray<RunConfigEvaluatorRef>;
|
|
402
|
+
readonly evaluatorPattern?: undefined;
|
|
403
|
+
/**
|
|
404
|
+
* How many times each test case in this dataset is evaluated for this row (default: 1).
|
|
405
|
+
* All executions of the same logical test case share one `repetitionId` in evaluator `meta`.
|
|
406
|
+
*/
|
|
407
|
+
readonly repetitions?: number;
|
|
408
|
+
}
|
|
409
|
+
/** Select evaluators using the same wildcard / regex rules as the runner's `resolveEvaluatorsByNamePattern`. */
|
|
410
|
+
interface RunConfigRowPattern {
|
|
411
|
+
readonly dataset: Dataset;
|
|
412
|
+
readonly evaluatorPattern: string;
|
|
413
|
+
readonly evaluators?: undefined;
|
|
414
|
+
readonly repetitions?: number;
|
|
415
|
+
}
|
|
416
|
+
type RunConfigRow = RunConfigRowEvaluators | RunConfigRowPattern;
|
|
417
|
+
interface RunConfigDefineConfig {
|
|
418
|
+
/**
|
|
419
|
+
* Stable id (letters, digits, `_`, `-`); surfaced in discovery, CLI `--run-config`, and evaluator `meta`.
|
|
420
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
421
|
+
*/
|
|
422
|
+
name: string;
|
|
423
|
+
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
424
|
+
displayName?: string;
|
|
425
|
+
/** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
|
|
426
|
+
tags?: ReadonlyArray<string>;
|
|
427
|
+
runs: ReadonlyArray<RunConfigRow>;
|
|
428
|
+
}
|
|
429
|
+
declare class RunConfig {
|
|
430
|
+
private readonly _name;
|
|
431
|
+
private readonly _displayName;
|
|
432
|
+
private readonly _tags;
|
|
433
|
+
private readonly _runs;
|
|
434
|
+
private constructor();
|
|
435
|
+
static define(config: RunConfigDefineConfig): RunConfig;
|
|
436
|
+
/** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
|
|
437
|
+
getName(): string;
|
|
438
|
+
/** Optional unrestricted display label. */
|
|
439
|
+
getDisplayName(): string | undefined;
|
|
440
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
441
|
+
getDisplayLabel(): string;
|
|
442
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
443
|
+
getTags(): string[];
|
|
444
|
+
getRuns(): ReadonlyArray<RunConfigRow>;
|
|
445
|
+
}
|
|
446
|
+
|
|
323
447
|
type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
|
|
324
448
|
interface ScoreItem<TData = unknown> {
|
|
325
449
|
readonly id: string;
|
|
@@ -387,6 +511,29 @@ interface CollectedEvaluator {
|
|
|
387
511
|
filePath: string;
|
|
388
512
|
evaluator: Evaluator<unknown, unknown, unknown, unknown>;
|
|
389
513
|
}
|
|
514
|
+
interface CollectedRunConfig {
|
|
515
|
+
id: string;
|
|
516
|
+
filePath: string;
|
|
517
|
+
runConfig: RunConfig;
|
|
518
|
+
}
|
|
519
|
+
/** One dataset + evaluator set queued as part of a RunConfig or batch run. */
|
|
520
|
+
interface RunDatasetJob {
|
|
521
|
+
datasetId: string;
|
|
522
|
+
evaluatorIds: ReadonlyArray<string>;
|
|
523
|
+
/** RunConfig name (same as `RunConfig.getName()`). */
|
|
524
|
+
runConfigName: string;
|
|
525
|
+
/**
|
|
526
|
+
* Human-facing RunConfig label (`displayName ?? name`) when the job was expanded from `RunConfig.define`.
|
|
527
|
+
* Omitted for ad-hoc jobs; UI should fall back to {@link runConfigName}.
|
|
528
|
+
*/
|
|
529
|
+
runConfigDisplayLabel?: string;
|
|
530
|
+
/**
|
|
531
|
+
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
|
|
532
|
+
*/
|
|
533
|
+
runConfigTags?: ReadonlyArray<string>;
|
|
534
|
+
/** Evaluates each matching test case this many times (default 1). */
|
|
535
|
+
repetitions: number;
|
|
536
|
+
}
|
|
390
537
|
interface CollectedTestCase {
|
|
391
538
|
id: string;
|
|
392
539
|
filePath: string;
|
|
@@ -398,6 +545,10 @@ interface SearchTestCasesQuery {
|
|
|
398
545
|
includedPaths?: ReadonlyArray<string | RegExp>;
|
|
399
546
|
excludedPaths?: ReadonlyArray<string | RegExp>;
|
|
400
547
|
}
|
|
548
|
+
/** Use with `RunDatasetRequest` for API / TUI runs that are not backed by a `RunConfig` file. */
|
|
549
|
+
declare const PROGRAMMATIC_RUN_CONFIG: {
|
|
550
|
+
readonly runConfigName: "programmatic";
|
|
551
|
+
};
|
|
401
552
|
interface RunDatasetRequest {
|
|
402
553
|
/**
|
|
403
554
|
* Identifier for what triggered the run request (for example, a CLI command).
|
|
@@ -406,7 +557,17 @@ interface RunDatasetRequest {
|
|
|
406
557
|
triggerId?: string;
|
|
407
558
|
datasetId: string;
|
|
408
559
|
evaluatorIds: ReadonlyArray<string>;
|
|
560
|
+
/** RunConfig name surfaced on evaluator `meta` (from the job or `PROGRAMMATIC_RUN_CONFIG`). */
|
|
561
|
+
runConfigName: string;
|
|
409
562
|
concurrency?: number;
|
|
563
|
+
/**
|
|
564
|
+
* How many times each test case is executed (default: 1). For RunConfig-backed runs, set per row on the config.
|
|
565
|
+
*/
|
|
566
|
+
repetitions?: number;
|
|
567
|
+
/**
|
|
568
|
+
* Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
569
|
+
*/
|
|
570
|
+
runConfigTags?: ReadonlyArray<string>;
|
|
410
571
|
}
|
|
411
572
|
interface RunSnapshot {
|
|
412
573
|
runId: string;
|
|
@@ -443,8 +604,9 @@ type RunnerEvent = {
|
|
|
443
604
|
testCaseName: string;
|
|
444
605
|
startedTestCases: number;
|
|
445
606
|
totalTestCases: number;
|
|
446
|
-
|
|
447
|
-
|
|
607
|
+
repetitionId: string;
|
|
608
|
+
repetitionIndex: number;
|
|
609
|
+
repetitionCount: number;
|
|
448
610
|
} | {
|
|
449
611
|
type: 'TestCaseProgress';
|
|
450
612
|
runId: string;
|
|
@@ -452,8 +614,9 @@ type RunnerEvent = {
|
|
|
452
614
|
testCaseName: string;
|
|
453
615
|
completedTestCases: number;
|
|
454
616
|
totalTestCases: number;
|
|
455
|
-
|
|
456
|
-
|
|
617
|
+
repetitionId: string;
|
|
618
|
+
repetitionIndex: number;
|
|
619
|
+
repetitionCount: number;
|
|
457
620
|
passed: boolean;
|
|
458
621
|
durationMs: number;
|
|
459
622
|
evaluatorScores: ReadonlyArray<{
|
|
@@ -488,11 +651,26 @@ type RunnerEvent = {
|
|
|
488
651
|
interface SubscribeOptions {
|
|
489
652
|
runId?: string;
|
|
490
653
|
}
|
|
654
|
+
interface RunDatasetJobsWithSharedConcurrencyRequest {
|
|
655
|
+
jobs: ReadonlyArray<RunDatasetJob>;
|
|
656
|
+
globalConcurrency: number;
|
|
657
|
+
triggerId?: string;
|
|
658
|
+
}
|
|
491
659
|
interface RunnerApi {
|
|
492
660
|
collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
|
|
493
661
|
collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
662
|
+
collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
|
|
494
663
|
resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
|
|
495
664
|
resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
665
|
+
/**
|
|
666
|
+
* Resolves a RunConfig by name (case-insensitive).
|
|
667
|
+
* @throws If more than one discovered RunConfig uses the same name (list file paths in the error).
|
|
668
|
+
*/
|
|
669
|
+
resolveRunConfigByName(name: string): Promise<CollectedRunConfig | undefined>;
|
|
670
|
+
expandRunConfigToJobs(collected: CollectedRunConfig): Promise<ReadonlyArray<RunDatasetJob>>;
|
|
671
|
+
/** Resolves each name in order and concatenates expanded jobs (the same name may appear more than once). */
|
|
672
|
+
expandRunConfigNamesToJobs(names: ReadonlyArray<string>): Promise<ReadonlyArray<RunDatasetJob>>;
|
|
673
|
+
runDatasetJobsWithSharedConcurrency(request: RunDatasetJobsWithSharedConcurrencyRequest): Promise<ReadonlyArray<RunSnapshot>>;
|
|
496
674
|
searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
|
|
497
675
|
collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
|
|
498
676
|
runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
|
|
@@ -538,4 +716,20 @@ interface BinaryScoreData {
|
|
|
538
716
|
}
|
|
539
717
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
540
718
|
|
|
541
|
-
|
|
719
|
+
/**
|
|
720
|
+
* Map from each tag literal to a `string` value (the same string at runtime).
|
|
721
|
+
* Lets you reference `set['my-tag']` with autocomplete and errors on unknown keys.
|
|
722
|
+
*/
|
|
723
|
+
type TagSetMembers<T extends readonly string[]> = {
|
|
724
|
+
readonly [K in T[number]]: string;
|
|
725
|
+
};
|
|
726
|
+
/**
|
|
727
|
+
* Closed set of tag strings for type-safe references (`set['alpha']` is valid; `set['nope']` is a type error).
|
|
728
|
+
* Values are plain `string`, so they assign to `string[]`, dataset matchers, etc.
|
|
729
|
+
*/
|
|
730
|
+
declare class TagSet {
|
|
731
|
+
private constructor();
|
|
732
|
+
static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|