@m4trix/evals 0.25.1 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -9
- package/dist/cli-simple.cjs +845 -455
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +846 -456
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +543 -273
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +543 -273
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +948 -545
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +228 -14
- package/dist/index.js +933 -547
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -86,6 +86,7 @@ interface RunnerDiscoveryConfig {
|
|
|
86
86
|
rootDir: string;
|
|
87
87
|
datasetSuffixes: ReadonlyArray<string>;
|
|
88
88
|
evaluatorSuffixes: ReadonlyArray<string>;
|
|
89
|
+
runConfigSuffixes: ReadonlyArray<string>;
|
|
89
90
|
testCaseSuffixes: ReadonlyArray<string>;
|
|
90
91
|
excludeDirectories: ReadonlyArray<string>;
|
|
91
92
|
}
|
|
@@ -102,9 +103,11 @@ interface M4trixEvalConfigDiscovery {
|
|
|
102
103
|
rootDir?: string;
|
|
103
104
|
datasetFilePatterns?: ReadonlyArray<string>;
|
|
104
105
|
evaluatorFilePatterns?: ReadonlyArray<string>;
|
|
106
|
+
runConfigFilePatterns?: ReadonlyArray<string>;
|
|
105
107
|
testCaseFilePatterns?: ReadonlyArray<string>;
|
|
106
108
|
datasetSuffixes?: ReadonlyArray<string>;
|
|
107
109
|
evaluatorSuffixes?: ReadonlyArray<string>;
|
|
110
|
+
runConfigSuffixes?: ReadonlyArray<string>;
|
|
108
111
|
testCaseSuffixes?: ReadonlyArray<string>;
|
|
109
112
|
excludeDirectories?: ReadonlyArray<string>;
|
|
110
113
|
}
|
|
@@ -120,16 +123,16 @@ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalCon
|
|
|
120
123
|
declare const defaultRunnerConfig: RunnerConfig;
|
|
121
124
|
declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
|
|
122
125
|
|
|
123
|
-
/** Matches a tag by exact string equality or regex test */
|
|
124
|
-
type TagMatcher = string | RegExp;
|
|
125
|
-
/** Matches a file path by glob string or regex test */
|
|
126
|
-
type PathMatcher = string | RegExp;
|
|
127
|
-
|
|
128
126
|
type InputOrBuilder<T> = T | (() => T);
|
|
129
127
|
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
128
|
+
/**
|
|
129
|
+
* Stable id (letters, digits, `_`, `-`); used in discovery and matching.
|
|
130
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
131
|
+
*/
|
|
130
132
|
name: string;
|
|
133
|
+
/** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
|
|
134
|
+
displayName?: string;
|
|
131
135
|
tags: string[];
|
|
132
|
-
reruns?: number;
|
|
133
136
|
inputSchema: TI;
|
|
134
137
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
135
138
|
outputSchema?: TO;
|
|
@@ -139,17 +142,38 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
|
|
|
139
142
|
private readonly _config;
|
|
140
143
|
private constructor();
|
|
141
144
|
static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
|
|
142
|
-
getReruns(): number;
|
|
143
145
|
getName(): string;
|
|
146
|
+
getDisplayName(): string | undefined;
|
|
147
|
+
getDisplayLabel(): string;
|
|
144
148
|
getTags(): string[];
|
|
145
149
|
getInputSchema(): Schema.Schema.Any;
|
|
146
150
|
getInput(): TInput;
|
|
147
151
|
getOutputSchema(): Schema.Schema.Any | undefined;
|
|
148
152
|
getOutput(): TOutput | undefined;
|
|
149
153
|
}
|
|
154
|
+
/** CLI-friendly label: {@link TestCase.getDisplayLabel} when present, else {@link TestCase.getName} (supports plain test-case-shaped objects). */
|
|
155
|
+
declare function getTestCaseDisplayLabel(testCase: {
|
|
156
|
+
getDisplayLabel?: () => string;
|
|
157
|
+
getName?: () => string;
|
|
158
|
+
}): string;
|
|
159
|
+
/** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
|
|
160
|
+
declare function getTestCaseTagList(testCase: {
|
|
161
|
+
getTags?: () => ReadonlyArray<string>;
|
|
162
|
+
}): string[];
|
|
163
|
+
|
|
164
|
+
/** Matches a tag by exact string equality or regex test */
|
|
165
|
+
type TagMatcher = string | RegExp;
|
|
166
|
+
/** Matches a file path by glob string or regex test */
|
|
167
|
+
type PathMatcher = string | RegExp;
|
|
150
168
|
|
|
151
169
|
interface DatasetDefineConfig {
|
|
170
|
+
/**
|
|
171
|
+
* Stable id (letters, digits, `_`, `-`); used for discovery ids and `resolveDatasetByName`.
|
|
172
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
173
|
+
*/
|
|
152
174
|
name: string;
|
|
175
|
+
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
176
|
+
displayName?: string;
|
|
153
177
|
includedTags?: TagMatcher[];
|
|
154
178
|
excludedTags?: TagMatcher[];
|
|
155
179
|
includedPaths?: PathMatcher[];
|
|
@@ -159,13 +183,22 @@ declare class Dataset {
|
|
|
159
183
|
private readonly _config;
|
|
160
184
|
private constructor();
|
|
161
185
|
static define(config: DatasetDefineConfig): Dataset;
|
|
186
|
+
/** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
|
|
162
187
|
getName(): string;
|
|
188
|
+
getDisplayName(): string | undefined;
|
|
189
|
+
/** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
190
|
+
getDisplayLabel(): string;
|
|
163
191
|
getIncludedTags(): ReadonlyArray<TagMatcher>;
|
|
164
192
|
getExcludedTags(): ReadonlyArray<TagMatcher>;
|
|
165
193
|
getIncludedPaths(): ReadonlyArray<PathMatcher>;
|
|
166
194
|
getExcludedPaths(): ReadonlyArray<PathMatcher>;
|
|
167
195
|
matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
|
|
168
196
|
}
|
|
197
|
+
/** CLI / runner: display label for a dataset-shaped object (supports discovery duck-types). */
|
|
198
|
+
declare function getDatasetDisplayLabel(dataset: {
|
|
199
|
+
getDisplayLabel?: () => string;
|
|
200
|
+
getName?: () => string;
|
|
201
|
+
}): string;
|
|
169
202
|
|
|
170
203
|
/**
|
|
171
204
|
* Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
|
|
@@ -241,8 +274,19 @@ interface EvaluateMeta {
|
|
|
241
274
|
* for this specific test-case run.
|
|
242
275
|
*/
|
|
243
276
|
runId: string;
|
|
244
|
-
/**
|
|
245
|
-
|
|
277
|
+
/** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
|
|
278
|
+
datasetName: string;
|
|
279
|
+
/** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
|
|
280
|
+
runConfigName: string;
|
|
281
|
+
/**
|
|
282
|
+
* Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
|
|
283
|
+
* (and present with count 1 for consistency).
|
|
284
|
+
*/
|
|
285
|
+
repetitionId: string;
|
|
286
|
+
/** 1-based index of this execution within the repetition group. */
|
|
287
|
+
repetitionIndex: number;
|
|
288
|
+
/** Total scheduled executions for this logical test case in the current run. */
|
|
289
|
+
repetitionCount: number;
|
|
246
290
|
}
|
|
247
291
|
interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
|
|
248
292
|
input: TInput;
|
|
@@ -250,6 +294,12 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
250
294
|
output?: TOutput;
|
|
251
295
|
/** Metadata about the current evaluator invocation. */
|
|
252
296
|
meta: EvaluateMeta;
|
|
297
|
+
/** Tags from `TestCase.describe({ tags })` for the current test case. */
|
|
298
|
+
testCaseTags: string[];
|
|
299
|
+
/** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
|
|
300
|
+
runConfigTags: string[];
|
|
301
|
+
/** Tags from `Evaluator.define({ tags })` for this evaluator. */
|
|
302
|
+
evaluatorTags: string[];
|
|
253
303
|
/** Records a diff for this test case; stored in run artifact and shown by CLI */
|
|
254
304
|
logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
|
|
255
305
|
/** Logs a message or object for this test case; stored in run artifact and shown by CLI */
|
|
@@ -266,12 +316,20 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
266
316
|
}
|
|
267
317
|
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
|
|
268
318
|
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
319
|
+
/**
|
|
320
|
+
* Stable id (letters, digits, `_`, `-`); used for discovery, name patterns, and `meta`.
|
|
321
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
322
|
+
*/
|
|
269
323
|
name: string;
|
|
324
|
+
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
325
|
+
displayName?: string;
|
|
270
326
|
inputSchema: TI;
|
|
271
327
|
outputSchema: TO;
|
|
272
328
|
scoreSchema: TS;
|
|
273
329
|
passThreshold?: number;
|
|
274
330
|
passCriterion?: (score: unknown) => boolean;
|
|
331
|
+
/** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
|
|
332
|
+
tags?: ReadonlyArray<string>;
|
|
275
333
|
}
|
|
276
334
|
declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
|
|
277
335
|
private readonly _config;
|
|
@@ -281,7 +339,13 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
|
|
|
281
339
|
use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
|
|
282
340
|
define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
|
|
283
341
|
evaluate(fn: EvaluateFn<TInput, TOutput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
|
|
342
|
+
/** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
|
|
284
343
|
getName(): string | undefined;
|
|
344
|
+
getDisplayName(): string | undefined;
|
|
345
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
|
|
346
|
+
getDisplayLabel(): string | undefined;
|
|
347
|
+
/** Tags from `Evaluator.define({ tags })`; empty until defined. */
|
|
348
|
+
getTags(): string[];
|
|
285
349
|
getInputSchema(): Schema.Schema.Any | undefined;
|
|
286
350
|
getOutputSchema(): Schema.Schema.Any | undefined;
|
|
287
351
|
getScoreSchema(): Schema.Schema.Any | undefined;
|
|
@@ -291,6 +355,15 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
|
|
|
291
355
|
getPassCriterion(): ((score: unknown) => boolean) | undefined;
|
|
292
356
|
resolveContext(): Promise<TCtx>;
|
|
293
357
|
}
|
|
358
|
+
/** CLI-friendly label: {@link Evaluator.getDisplayLabel} when present, else {@link Evaluator.getName} (supports plain evaluator-shaped objects from discovery). */
|
|
359
|
+
declare function getEvaluatorDisplayLabel(evaluator: {
|
|
360
|
+
getDisplayLabel?: () => string | undefined;
|
|
361
|
+
getName?: () => string | undefined;
|
|
362
|
+
}): string | undefined;
|
|
363
|
+
/** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
|
|
364
|
+
declare function getEvaluatorTagList(evaluator: {
|
|
365
|
+
getTags?: () => ReadonlyArray<string>;
|
|
366
|
+
}): string[];
|
|
294
367
|
|
|
295
368
|
interface MetricItem<TData = unknown> {
|
|
296
369
|
readonly id: string;
|
|
@@ -320,6 +393,76 @@ declare const Metric: {
|
|
|
320
393
|
};
|
|
321
394
|
declare function getMetricById(id: string): MetricDef<unknown> | undefined;
|
|
322
395
|
|
|
396
|
+
/** Branded id for `RunConfig` `name` (decode with {@link RunConfigNameSchema}). */
|
|
397
|
+
declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "RunConfigName">;
|
|
398
|
+
/** Branded id for `Evaluator.define({ name })` (decode with {@link EvaluatorNameSchema}). */
|
|
399
|
+
declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
|
|
400
|
+
/** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
|
|
401
|
+
declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
|
|
402
|
+
/** Branded id for `Dataset.define({ name })` (decode with {@link DatasetNameSchema}). */
|
|
403
|
+
declare const DatasetNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "DatasetName">;
|
|
404
|
+
type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
|
|
405
|
+
type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
|
|
406
|
+
type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
|
|
407
|
+
type DatasetName = Schema.Schema.Type<typeof DatasetNameSchema>;
|
|
408
|
+
declare function validateRunConfigName(raw: string, context: string): RunConfigName;
|
|
409
|
+
declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
|
|
410
|
+
declare function validateTestCaseName(raw: string, context: string): TestCaseName;
|
|
411
|
+
declare function validateDatasetName(raw: string, context: string): DatasetName;
|
|
412
|
+
/** Optional UI label: trim; empty after trim becomes undefined. */
|
|
413
|
+
declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
|
|
414
|
+
|
|
415
|
+
/** Heterogeneous evaluator rows; `unknown` breaks assignability from concrete `EvaluateFn` (contravariance on `input`). */
|
|
416
|
+
type RunConfigEvaluatorRef = Evaluator<any, any, any, any>;
|
|
417
|
+
/** Select evaluators by concrete instances (same module exports as discovery). */
|
|
418
|
+
interface RunConfigRowEvaluators {
|
|
419
|
+
readonly dataset: Dataset;
|
|
420
|
+
readonly evaluators: ReadonlyArray<RunConfigEvaluatorRef>;
|
|
421
|
+
readonly evaluatorPattern?: undefined;
|
|
422
|
+
/**
|
|
423
|
+
* How many times each test case in this dataset is evaluated for this row (default: 1).
|
|
424
|
+
* All executions of the same logical test case share one `repetitionId` in evaluator `meta`.
|
|
425
|
+
*/
|
|
426
|
+
readonly repetitions?: number;
|
|
427
|
+
}
|
|
428
|
+
/** Select evaluators using the same wildcard / regex rules as the runner's `resolveEvaluatorsByNamePattern`. */
|
|
429
|
+
interface RunConfigRowPattern {
|
|
430
|
+
readonly dataset: Dataset;
|
|
431
|
+
readonly evaluatorPattern: string;
|
|
432
|
+
readonly evaluators?: undefined;
|
|
433
|
+
readonly repetitions?: number;
|
|
434
|
+
}
|
|
435
|
+
type RunConfigRow = RunConfigRowEvaluators | RunConfigRowPattern;
|
|
436
|
+
interface RunConfigDefineConfig {
|
|
437
|
+
/**
|
|
438
|
+
* Stable id (letters, digits, `_`, `-`); surfaced in discovery, CLI `--run-config`, and evaluator `meta`.
|
|
439
|
+
* For an unrestricted UI label, set {@link displayName}.
|
|
440
|
+
*/
|
|
441
|
+
name: string;
|
|
442
|
+
/** Optional human-readable label for CLI/TUI (any characters). */
|
|
443
|
+
displayName?: string;
|
|
444
|
+
/** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
|
|
445
|
+
tags?: ReadonlyArray<string>;
|
|
446
|
+
runs: ReadonlyArray<RunConfigRow>;
|
|
447
|
+
}
|
|
448
|
+
declare class RunConfig {
|
|
449
|
+
private readonly _name;
|
|
450
|
+
private readonly _displayName;
|
|
451
|
+
private readonly _tags;
|
|
452
|
+
private readonly _runs;
|
|
453
|
+
private constructor();
|
|
454
|
+
static define(config: RunConfigDefineConfig): RunConfig;
|
|
455
|
+
/** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
|
|
456
|
+
getName(): string;
|
|
457
|
+
/** Optional unrestricted display label. */
|
|
458
|
+
getDisplayName(): string | undefined;
|
|
459
|
+
/** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
|
|
460
|
+
getDisplayLabel(): string;
|
|
461
|
+
/** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
|
|
462
|
+
getTags(): string[];
|
|
463
|
+
getRuns(): ReadonlyArray<RunConfigRow>;
|
|
464
|
+
}
|
|
465
|
+
|
|
323
466
|
type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
|
|
324
467
|
interface ScoreItem<TData = unknown> {
|
|
325
468
|
readonly id: string;
|
|
@@ -387,6 +530,29 @@ interface CollectedEvaluator {
|
|
|
387
530
|
filePath: string;
|
|
388
531
|
evaluator: Evaluator<unknown, unknown, unknown, unknown>;
|
|
389
532
|
}
|
|
533
|
+
interface CollectedRunConfig {
|
|
534
|
+
id: string;
|
|
535
|
+
filePath: string;
|
|
536
|
+
runConfig: RunConfig;
|
|
537
|
+
}
|
|
538
|
+
/** One dataset + evaluator set queued as part of a RunConfig or batch run. */
|
|
539
|
+
interface RunDatasetJob {
|
|
540
|
+
datasetId: string;
|
|
541
|
+
evaluatorIds: ReadonlyArray<string>;
|
|
542
|
+
/** RunConfig name (same as `RunConfig.getName()`). */
|
|
543
|
+
runConfigName: string;
|
|
544
|
+
/**
|
|
545
|
+
* Human-facing RunConfig label (`displayName ?? name`) when the job was expanded from `RunConfig.define`.
|
|
546
|
+
* Omitted for ad-hoc jobs; UI should fall back to {@link runConfigName}.
|
|
547
|
+
*/
|
|
548
|
+
runConfigDisplayLabel?: string;
|
|
549
|
+
/**
|
|
550
|
+
* Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
|
|
551
|
+
*/
|
|
552
|
+
runConfigTags?: ReadonlyArray<string>;
|
|
553
|
+
/** Evaluates each matching test case this many times (default 1). */
|
|
554
|
+
repetitions: number;
|
|
555
|
+
}
|
|
390
556
|
interface CollectedTestCase {
|
|
391
557
|
id: string;
|
|
392
558
|
filePath: string;
|
|
@@ -398,6 +564,10 @@ interface SearchTestCasesQuery {
|
|
|
398
564
|
includedPaths?: ReadonlyArray<string | RegExp>;
|
|
399
565
|
excludedPaths?: ReadonlyArray<string | RegExp>;
|
|
400
566
|
}
|
|
567
|
+
/** Use with `RunDatasetRequest` for API / TUI runs that are not backed by a `RunConfig` file. */
|
|
568
|
+
declare const PROGRAMMATIC_RUN_CONFIG: {
|
|
569
|
+
readonly runConfigName: "programmatic";
|
|
570
|
+
};
|
|
401
571
|
interface RunDatasetRequest {
|
|
402
572
|
/**
|
|
403
573
|
* Identifier for what triggered the run request (for example, a CLI command).
|
|
@@ -406,7 +576,17 @@ interface RunDatasetRequest {
|
|
|
406
576
|
triggerId?: string;
|
|
407
577
|
datasetId: string;
|
|
408
578
|
evaluatorIds: ReadonlyArray<string>;
|
|
579
|
+
/** RunConfig name surfaced on evaluator `meta` (from the job or `PROGRAMMATIC_RUN_CONFIG`). */
|
|
580
|
+
runConfigName: string;
|
|
409
581
|
concurrency?: number;
|
|
582
|
+
/**
|
|
583
|
+
* How many times each test case is executed (default: 1). For RunConfig-backed runs, set per row on the config.
|
|
584
|
+
*/
|
|
585
|
+
repetitions?: number;
|
|
586
|
+
/**
|
|
587
|
+
* Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
|
|
588
|
+
*/
|
|
589
|
+
runConfigTags?: ReadonlyArray<string>;
|
|
410
590
|
}
|
|
411
591
|
interface RunSnapshot {
|
|
412
592
|
runId: string;
|
|
@@ -443,8 +623,9 @@ type RunnerEvent = {
|
|
|
443
623
|
testCaseName: string;
|
|
444
624
|
startedTestCases: number;
|
|
445
625
|
totalTestCases: number;
|
|
446
|
-
|
|
447
|
-
|
|
626
|
+
repetitionId: string;
|
|
627
|
+
repetitionIndex: number;
|
|
628
|
+
repetitionCount: number;
|
|
448
629
|
} | {
|
|
449
630
|
type: 'TestCaseProgress';
|
|
450
631
|
runId: string;
|
|
@@ -452,8 +633,9 @@ type RunnerEvent = {
|
|
|
452
633
|
testCaseName: string;
|
|
453
634
|
completedTestCases: number;
|
|
454
635
|
totalTestCases: number;
|
|
455
|
-
|
|
456
|
-
|
|
636
|
+
repetitionId: string;
|
|
637
|
+
repetitionIndex: number;
|
|
638
|
+
repetitionCount: number;
|
|
457
639
|
passed: boolean;
|
|
458
640
|
durationMs: number;
|
|
459
641
|
evaluatorScores: ReadonlyArray<{
|
|
@@ -488,11 +670,27 @@ type RunnerEvent = {
|
|
|
488
670
|
interface SubscribeOptions {
|
|
489
671
|
runId?: string;
|
|
490
672
|
}
|
|
673
|
+
interface RunDatasetJobsWithSharedConcurrencyRequest {
|
|
674
|
+
jobs: ReadonlyArray<RunDatasetJob>;
|
|
675
|
+
globalConcurrency: number;
|
|
676
|
+
triggerId?: string;
|
|
677
|
+
}
|
|
491
678
|
interface RunnerApi {
|
|
492
679
|
collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
|
|
493
680
|
collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
681
|
+
collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
|
|
682
|
+
/** Resolves a dataset by canonical **`Dataset` `name`** (id), case-insensitive. */
|
|
494
683
|
resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
|
|
495
684
|
resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
685
|
+
/**
|
|
686
|
+
* Resolves a RunConfig by its canonical `name` (the id from `RunConfig.getName()`, not `displayName`), case-insensitive.
|
|
687
|
+
* @throws If more than one discovered RunConfig uses the same name (the error lists their file paths).
|
|
688
|
+
*/
|
|
689
|
+
resolveRunConfigByName(name: string): Promise<CollectedRunConfig | undefined>;
|
|
690
|
+
expandRunConfigToJobs(collected: CollectedRunConfig): Promise<ReadonlyArray<RunDatasetJob>>;
|
|
691
|
+
/** Resolves each name in order and concatenates expanded jobs (the same name may appear more than once). */
|
|
692
|
+
expandRunConfigNamesToJobs(names: ReadonlyArray<string>): Promise<ReadonlyArray<RunDatasetJob>>;
|
|
693
|
+
runDatasetJobsWithSharedConcurrency(request: RunDatasetJobsWithSharedConcurrencyRequest): Promise<ReadonlyArray<RunSnapshot>>;
|
|
496
694
|
searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
|
|
497
695
|
collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
|
|
498
696
|
runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
|
|
@@ -538,4 +736,20 @@ interface BinaryScoreData {
|
|
|
538
736
|
}
|
|
539
737
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
540
738
|
|
|
541
|
-
|
|
739
|
+
/**
|
|
740
|
+
* Map from each tag literal to a `string` value (the same string at runtime).
|
|
741
|
+
* Lets you reference `set['my-tag']` with autocomplete and errors on unknown keys.
|
|
742
|
+
*/
|
|
743
|
+
type TagSetMembers<T extends readonly string[]> = {
|
|
744
|
+
readonly [K in T[number]]: string;
|
|
745
|
+
};
|
|
746
|
+
/**
|
|
747
|
+
* Closed set of tag strings for type-safe references (`set['alpha']` is valid; `set['nope']` is a type error).
|
|
748
|
+
* Values are plain `string`, so they assign to `string[]`, dataset matchers, etc.
|
|
749
|
+
*/
|
|
750
|
+
declare class TagSet {
|
|
751
|
+
private constructor();
|
|
752
|
+
static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DatasetDefineConfig, DatasetName, DatasetNameSchema, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };
|