@m4trix/evals 0.25.1 → 0.27.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -86,6 +86,7 @@ interface RunnerDiscoveryConfig {
86
86
  rootDir: string;
87
87
  datasetSuffixes: ReadonlyArray<string>;
88
88
  evaluatorSuffixes: ReadonlyArray<string>;
89
+ runConfigSuffixes: ReadonlyArray<string>;
89
90
  testCaseSuffixes: ReadonlyArray<string>;
90
91
  excludeDirectories: ReadonlyArray<string>;
91
92
  }
@@ -102,9 +103,11 @@ interface M4trixEvalConfigDiscovery {
102
103
  rootDir?: string;
103
104
  datasetFilePatterns?: ReadonlyArray<string>;
104
105
  evaluatorFilePatterns?: ReadonlyArray<string>;
106
+ runConfigFilePatterns?: ReadonlyArray<string>;
105
107
  testCaseFilePatterns?: ReadonlyArray<string>;
106
108
  datasetSuffixes?: ReadonlyArray<string>;
107
109
  evaluatorSuffixes?: ReadonlyArray<string>;
110
+ runConfigSuffixes?: ReadonlyArray<string>;
108
111
  testCaseSuffixes?: ReadonlyArray<string>;
109
112
  excludeDirectories?: ReadonlyArray<string>;
110
113
  }
@@ -120,16 +123,16 @@ declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalCon
120
123
  declare const defaultRunnerConfig: RunnerConfig;
121
124
  declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
122
125
 
123
- /** Matches a tag by exact string equality or regex test */
124
- type TagMatcher = string | RegExp;
125
- /** Matches a file path by glob string or regex test */
126
- type PathMatcher = string | RegExp;
127
-
128
126
  type InputOrBuilder<T> = T | (() => T);
129
127
  interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
128
+ /**
129
+ * Stable id (letters, digits, `_`, `-`); used in discovery and matching.
130
+ * For an unrestricted UI label, set {@link displayName}.
131
+ */
130
132
  name: string;
133
+ /** Optional human-readable label for CLI/TUI and evaluator args (any characters). */
134
+ displayName?: string;
131
135
  tags: string[];
132
- reruns?: number;
133
136
  inputSchema: TI;
134
137
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
135
138
  outputSchema?: TO;
@@ -139,17 +142,38 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
139
142
  private readonly _config;
140
143
  private constructor();
141
144
  static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
142
- getReruns(): number;
143
145
  getName(): string;
146
+ getDisplayName(): string | undefined;
147
+ getDisplayLabel(): string;
144
148
  getTags(): string[];
145
149
  getInputSchema(): Schema.Schema.Any;
146
150
  getInput(): TInput;
147
151
  getOutputSchema(): Schema.Schema.Any | undefined;
148
152
  getOutput(): TOutput | undefined;
149
153
  }
154
+ /** CLI-friendly label: {@link TestCase.getDisplayLabel} when present, else {@link TestCase.getName} (supports plain test-case-shaped objects). */
155
+ declare function getTestCaseDisplayLabel(testCase: {
156
+ getDisplayLabel?: () => string;
157
+ getName?: () => string;
158
+ }): string;
159
+ /** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
160
+ declare function getTestCaseTagList(testCase: {
161
+ getTags?: () => ReadonlyArray<string>;
162
+ }): string[];
163
+
164
+ /** Matches a tag by exact string equality or regex test */
165
+ type TagMatcher = string | RegExp;
166
+ /** Matches a file path by glob string or regex test */
167
+ type PathMatcher = string | RegExp;
150
168
 
151
169
  interface DatasetDefineConfig {
170
+ /**
171
+ * Stable id (letters, digits, `_`, `-`); used for discovery ids and `resolveDatasetByName`.
172
+ * For an unrestricted UI label, set {@link displayName}.
173
+ */
152
174
  name: string;
175
+ /** Optional human-readable label for CLI/TUI (any characters). */
176
+ displayName?: string;
153
177
  includedTags?: TagMatcher[];
154
178
  excludedTags?: TagMatcher[];
155
179
  includedPaths?: PathMatcher[];
@@ -159,13 +183,22 @@ declare class Dataset {
159
183
  private readonly _config;
160
184
  private constructor();
161
185
  static define(config: DatasetDefineConfig): Dataset;
186
+ /** Canonical dataset id (same rules as `RunConfig` / `TestCase` `name`). */
162
187
  getName(): string;
188
+ getDisplayName(): string | undefined;
189
+ /** Label for CLI/TUI and evaluator `meta.datasetName`: {@link getDisplayName} if set, otherwise {@link getName}. */
190
+ getDisplayLabel(): string;
163
191
  getIncludedTags(): ReadonlyArray<TagMatcher>;
164
192
  getExcludedTags(): ReadonlyArray<TagMatcher>;
165
193
  getIncludedPaths(): ReadonlyArray<PathMatcher>;
166
194
  getExcludedPaths(): ReadonlyArray<PathMatcher>;
167
195
  matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
168
196
  }
197
+ /** CLI / runner: display label for a dataset-shaped object (supports discovery duck-types). */
198
+ declare function getDatasetDisplayLabel(dataset: {
199
+ getDisplayLabel?: () => string;
200
+ getName?: () => string;
201
+ }): string;
169
202
 
170
203
  /**
171
204
  * Options for customizing JSON diff output. Passed to logDiff, createDiffLogEntry, and printJsonDiff.
@@ -241,8 +274,19 @@ interface EvaluateMeta {
241
274
  * for this specific test-case run.
242
275
  */
243
276
  runId: string;
244
- /** Identifier of the dataset currently being evaluated. */
245
- datasetId: string;
277
+ /** Display label for the dataset (`Dataset.getDisplayLabel()`, i.e. `displayName ?? name`). */
278
+ datasetName: string;
279
+ /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
280
+ runConfigName: string;
281
+ /**
282
+ * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
283
+ * (and present with count 1 for consistency).
284
+ */
285
+ repetitionId: string;
286
+ /** 1-based index of this execution within the repetition group. */
287
+ repetitionIndex: number;
288
+ /** Total scheduled executions for this logical test case in the current run. */
289
+ repetitionCount: number;
246
290
  }
247
291
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
248
292
  input: TInput;
@@ -250,6 +294,12 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
250
294
  output?: TOutput;
251
295
  /** Metadata about the current evaluator invocation. */
252
296
  meta: EvaluateMeta;
297
+ /** Tags from `TestCase.describe({ tags })` for the current test case. */
298
+ testCaseTags: string[];
299
+ /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
300
+ runConfigTags: string[];
301
+ /** Tags from `Evaluator.define({ tags })` for this evaluator. */
302
+ evaluatorTags: string[];
253
303
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
254
304
  logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
255
305
  /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -266,12 +316,20 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
266
316
  }
267
317
  type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
268
318
  interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
319
+ /**
320
+ * Stable id (letters, digits, `_`, `-`); used for discovery, name patterns, and `meta`.
321
+ * For an unrestricted UI label, set {@link displayName}.
322
+ */
269
323
  name: string;
324
+ /** Optional human-readable label for CLI/TUI (any characters). */
325
+ displayName?: string;
270
326
  inputSchema: TI;
271
327
  outputSchema: TO;
272
328
  scoreSchema: TS;
273
329
  passThreshold?: number;
274
330
  passCriterion?: (score: unknown) => boolean;
331
+ /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
332
+ tags?: ReadonlyArray<string>;
275
333
  }
276
334
  declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
277
335
  private readonly _config;
@@ -281,7 +339,13 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
281
339
  use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
282
340
  define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
283
341
  evaluate(fn: EvaluateFn<TInput, TOutput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
342
+ /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
284
343
  getName(): string | undefined;
344
+ getDisplayName(): string | undefined;
345
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
346
+ getDisplayLabel(): string | undefined;
347
+ /** Tags from `Evaluator.define({ tags })`; empty until defined. */
348
+ getTags(): string[];
285
349
  getInputSchema(): Schema.Schema.Any | undefined;
286
350
  getOutputSchema(): Schema.Schema.Any | undefined;
287
351
  getScoreSchema(): Schema.Schema.Any | undefined;
@@ -291,6 +355,15 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
291
355
  getPassCriterion(): ((score: unknown) => boolean) | undefined;
292
356
  resolveContext(): Promise<TCtx>;
293
357
  }
358
+ /** CLI-friendly label: {@link Evaluator.getDisplayLabel} when present, else {@link Evaluator.getName} (supports plain evaluator-shaped objects from discovery). */
359
+ declare function getEvaluatorDisplayLabel(evaluator: {
360
+ getDisplayLabel?: () => string | undefined;
361
+ getName?: () => string | undefined;
362
+ }): string | undefined;
363
+ /** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
364
+ declare function getEvaluatorTagList(evaluator: {
365
+ getTags?: () => ReadonlyArray<string>;
366
+ }): string[];
294
367
 
295
368
  interface MetricItem<TData = unknown> {
296
369
  readonly id: string;
@@ -320,6 +393,76 @@ declare const Metric: {
320
393
  };
321
394
  declare function getMetricById(id: string): MetricDef<unknown> | undefined;
322
395
 
396
+ /** Branded id for `RunConfig` `name` (decode with {@link RunConfigNameSchema}). */
397
+ declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "RunConfigName">;
398
+ /** Branded id for `Evaluator.define({ name })` (decode with {@link EvaluatorNameSchema}). */
399
+ declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
400
+ /** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
401
+ declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
402
+ /** Branded id for `Dataset.define({ name })` (decode with {@link DatasetNameSchema}). */
403
+ declare const DatasetNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "DatasetName">;
404
+ type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
405
+ type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
406
+ type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
407
+ type DatasetName = Schema.Schema.Type<typeof DatasetNameSchema>;
408
+ declare function validateRunConfigName(raw: string, context: string): RunConfigName;
409
+ declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
410
+ declare function validateTestCaseName(raw: string, context: string): TestCaseName;
411
+ declare function validateDatasetName(raw: string, context: string): DatasetName;
412
+ /** Optional UI label: trim; empty after trim becomes undefined. */
413
+ declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
414
+
415
+ /** Heterogeneous evaluator rows; `unknown` breaks assignability from concrete `EvaluateFn` (contravariance on `input`). */
416
+ type RunConfigEvaluatorRef = Evaluator<any, any, any, any>;
417
+ /** Select evaluators by concrete instances (same module exports as discovery). */
418
+ interface RunConfigRowEvaluators {
419
+ readonly dataset: Dataset;
420
+ readonly evaluators: ReadonlyArray<RunConfigEvaluatorRef>;
421
+ readonly evaluatorPattern?: undefined;
422
+ /**
423
+ * How many times each test case in this dataset is evaluated for this row (default: 1).
424
+ * All executions of the same logical test case share one `repetitionId` in evaluator `meta`.
425
+ */
426
+ readonly repetitions?: number;
427
+ }
428
+ /** Select evaluators using the same wildcard / regex rules as the runner's `resolveEvaluatorsByNamePattern`. */
429
+ interface RunConfigRowPattern {
430
+ readonly dataset: Dataset;
431
+ readonly evaluatorPattern: string;
432
+ readonly evaluators?: undefined;
433
+ readonly repetitions?: number;
434
+ }
435
+ type RunConfigRow = RunConfigRowEvaluators | RunConfigRowPattern;
436
+ interface RunConfigDefineConfig {
437
+ /**
438
+ * Stable id (letters, digits, `_`, `-`); surfaced in discovery, CLI `--run-config`, and evaluator `meta`.
439
+ * For an unrestricted UI label, set {@link displayName}.
440
+ */
441
+ name: string;
442
+ /** Optional human-readable label for CLI/TUI (any characters). */
443
+ displayName?: string;
444
+ /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
445
+ tags?: ReadonlyArray<string>;
446
+ runs: ReadonlyArray<RunConfigRow>;
447
+ }
448
+ declare class RunConfig {
449
+ private readonly _name;
450
+ private readonly _displayName;
451
+ private readonly _tags;
452
+ private readonly _runs;
453
+ private constructor();
454
+ static define(config: RunConfigDefineConfig): RunConfig;
455
+ /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
456
+ getName(): string;
457
+ /** Optional unrestricted display label. */
458
+ getDisplayName(): string | undefined;
459
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
460
+ getDisplayLabel(): string;
461
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
462
+ getTags(): string[];
463
+ getRuns(): ReadonlyArray<RunConfigRow>;
464
+ }
465
+
323
466
  type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
324
467
  interface ScoreItem<TData = unknown> {
325
468
  readonly id: string;
@@ -387,6 +530,29 @@ interface CollectedEvaluator {
387
530
  filePath: string;
388
531
  evaluator: Evaluator<unknown, unknown, unknown, unknown>;
389
532
  }
533
+ interface CollectedRunConfig {
534
+ id: string;
535
+ filePath: string;
536
+ runConfig: RunConfig;
537
+ }
538
+ /** One dataset + evaluator set queued as part of a RunConfig or batch run. */
539
+ interface RunDatasetJob {
540
+ datasetId: string;
541
+ evaluatorIds: ReadonlyArray<string>;
542
+ /** RunConfig name (same as `RunConfig.getName()`). */
543
+ runConfigName: string;
544
+ /**
545
+ * Human-facing RunConfig label (`displayName ?? name`) when the job was expanded from `RunConfig.define`.
546
+ * Omitted for ad-hoc jobs; UI should fall back to {@link runConfigName}.
547
+ */
548
+ runConfigDisplayLabel?: string;
549
+ /**
550
+ * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
551
+ */
552
+ runConfigTags?: ReadonlyArray<string>;
553
+ /** Evaluates each matching test case this many times (default 1). */
554
+ repetitions: number;
555
+ }
390
556
  interface CollectedTestCase {
391
557
  id: string;
392
558
  filePath: string;
@@ -398,6 +564,10 @@ interface SearchTestCasesQuery {
398
564
  includedPaths?: ReadonlyArray<string | RegExp>;
399
565
  excludedPaths?: ReadonlyArray<string | RegExp>;
400
566
  }
567
+ /** Use with `RunDatasetRequest` for API / TUI runs that are not backed by a `RunConfig` file. */
568
+ declare const PROGRAMMATIC_RUN_CONFIG: {
569
+ readonly runConfigName: "programmatic";
570
+ };
401
571
  interface RunDatasetRequest {
402
572
  /**
403
573
  * Identifier for what triggered the run request (for example, a CLI command).
@@ -406,7 +576,17 @@ interface RunDatasetRequest {
406
576
  triggerId?: string;
407
577
  datasetId: string;
408
578
  evaluatorIds: ReadonlyArray<string>;
579
+ /** RunConfig name surfaced on evaluator `meta` (from the job or `PROGRAMMATIC_RUN_CONFIG`). */
580
+ runConfigName: string;
409
581
  concurrency?: number;
582
+ /**
583
+ * How many times each test case is executed (default: 1). For RunConfig-backed runs, set per row on the config.
584
+ */
585
+ repetitions?: number;
586
+ /**
587
+ * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
588
+ */
589
+ runConfigTags?: ReadonlyArray<string>;
410
590
  }
411
591
  interface RunSnapshot {
412
592
  runId: string;
@@ -443,8 +623,9 @@ type RunnerEvent = {
443
623
  testCaseName: string;
444
624
  startedTestCases: number;
445
625
  totalTestCases: number;
446
- rerunIndex: number;
447
- rerunTotal: number;
626
+ repetitionId: string;
627
+ repetitionIndex: number;
628
+ repetitionCount: number;
448
629
  } | {
449
630
  type: 'TestCaseProgress';
450
631
  runId: string;
@@ -452,8 +633,9 @@ type RunnerEvent = {
452
633
  testCaseName: string;
453
634
  completedTestCases: number;
454
635
  totalTestCases: number;
455
- rerunIndex: number;
456
- rerunTotal: number;
636
+ repetitionId: string;
637
+ repetitionIndex: number;
638
+ repetitionCount: number;
457
639
  passed: boolean;
458
640
  durationMs: number;
459
641
  evaluatorScores: ReadonlyArray<{
@@ -488,11 +670,27 @@ type RunnerEvent = {
488
670
  interface SubscribeOptions {
489
671
  runId?: string;
490
672
  }
673
+ interface RunDatasetJobsWithSharedConcurrencyRequest {
674
+ jobs: ReadonlyArray<RunDatasetJob>;
675
+ globalConcurrency: number;
676
+ triggerId?: string;
677
+ }
491
678
  interface RunnerApi {
492
679
  collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
493
680
  collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
681
+ collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
682
+ /** Resolves a dataset by canonical **`Dataset` `name`** (id), case-insensitive. */
494
683
  resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
495
684
  resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
685
+ /**
686
+ * Resolves a RunConfig by display name (case-insensitive).
687
+ * @throws If more than one discovered RunConfig uses the same name (list file paths in the error).
688
+ */
689
+ resolveRunConfigByName(name: string): Promise<CollectedRunConfig | undefined>;
690
+ expandRunConfigToJobs(collected: CollectedRunConfig): Promise<ReadonlyArray<RunDatasetJob>>;
691
+ /** Resolves each name in order and concatenates expanded jobs (the same name may appear more than once). */
692
+ expandRunConfigNamesToJobs(names: ReadonlyArray<string>): Promise<ReadonlyArray<RunDatasetJob>>;
693
+ runDatasetJobsWithSharedConcurrency(request: RunDatasetJobsWithSharedConcurrencyRequest): Promise<ReadonlyArray<RunSnapshot>>;
496
694
  searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
497
695
  collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
498
696
  runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
@@ -538,4 +736,20 @@ interface BinaryScoreData {
538
736
  }
539
737
  declare const binaryScore: ScoreDef<BinaryScoreData>;
540
738
 
541
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
739
+ /**
740
+ * Map from each tag literal to a `string` value (the same string at runtime).
741
+ * Lets you reference `set['my-tag']` with autocomplete and errors on unknown keys.
742
+ */
743
+ type TagSetMembers<T extends readonly string[]> = {
744
+ readonly [K in T[number]]: string;
745
+ };
746
+ /**
747
+ * Closed set of tag strings for type-safe references (`set['alpha']` is valid; `set['nope']` is a type error).
748
+ * Values are plain `string`, so they assign to `string[]`, dataset matchers, etc.
749
+ */
750
+ declare class TagSet {
751
+ private constructor();
752
+ static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
753
+ }
754
+
755
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DatasetDefineConfig, DatasetName, DatasetNameSchema, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getDatasetDisplayLabel, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateDatasetName, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };