@m4trix/evals 0.25.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -86,6 +86,7 @@ interface RunnerDiscoveryConfig {
86
86
  rootDir: string;
87
87
  datasetSuffixes: ReadonlyArray<string>;
88
88
  evaluatorSuffixes: ReadonlyArray<string>;
89
+ runConfigSuffixes: ReadonlyArray<string>;
89
90
  testCaseSuffixes: ReadonlyArray<string>;
90
91
  excludeDirectories: ReadonlyArray<string>;
91
92
  }
@@ -102,9 +103,11 @@ interface M4trixEvalConfigDiscovery {
102
103
  rootDir?: string;
103
104
  datasetFilePatterns?: ReadonlyArray<string>;
104
105
  evaluatorFilePatterns?: ReadonlyArray<string>;
106
+ runConfigFilePatterns?: ReadonlyArray<string>;
105
107
  testCaseFilePatterns?: ReadonlyArray<string>;
106
108
  datasetSuffixes?: ReadonlyArray<string>;
107
109
  evaluatorSuffixes?: ReadonlyArray<string>;
110
+ runConfigSuffixes?: ReadonlyArray<string>;
108
111
  testCaseSuffixes?: ReadonlyArray<string>;
109
112
  excludeDirectories?: ReadonlyArray<string>;
110
113
  }
@@ -127,9 +130,14 @@ type PathMatcher = string | RegExp;
127
130
 
128
131
  type InputOrBuilder<T> = T | (() => T);
129
132
  interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
133
+ /**
134
+ * Stable id (letters, digits, `_`, `-`).
135
+ * For an unrestricted UI label, set {@link displayName}.
136
+ */
130
137
  name: string;
138
+ /** Optional human-readable label for CLI/TUI (any characters). */
139
+ displayName?: string;
131
140
  tags: string[];
132
- reruns?: number;
133
141
  inputSchema: TI;
134
142
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
135
143
  outputSchema?: TO;
@@ -139,14 +147,24 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
139
147
  private readonly _config;
140
148
  private constructor();
141
149
  static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
142
- getReruns(): number;
143
150
  getName(): string;
151
+ getDisplayName(): string | undefined;
152
+ getDisplayLabel(): string;
144
153
  getTags(): string[];
145
154
  getInputSchema(): Schema.Schema.Any;
146
155
  getInput(): TInput;
147
156
  getOutputSchema(): Schema.Schema.Any | undefined;
148
157
  getOutput(): TOutput | undefined;
149
158
  }
159
+ /** CLI-friendly label: {@link TestCase.getDisplayLabel} when present, else {@link TestCase.getName} (supports plain test-case-shaped objects). */
160
+ declare function getTestCaseDisplayLabel(testCase: {
161
+ getDisplayLabel?: () => string;
162
+ getName?: () => string;
163
+ }): string;
164
+ /** Tags for evaluator `args.testCaseTags` (supports plain test-case-shaped objects without `getTags`). */
165
+ declare function getTestCaseTagList(testCase: {
166
+ getTags?: () => ReadonlyArray<string>;
167
+ }): string[];
150
168
 
151
169
  interface DatasetDefineConfig {
152
170
  name: string;
@@ -243,6 +261,17 @@ interface EvaluateMeta {
243
261
  runId: string;
244
262
  /** Identifier of the dataset currently being evaluated. */
245
263
  datasetId: string;
264
+ /** Canonical `RunConfig` name (or `programmatic` for API/TUI-only runs). */
265
+ runConfigName: string;
266
+ /**
267
+ * Stable id shared by every execution of the same logical test case when `repetitionCount > 1`
268
+ * (also present when `repetitionCount` is 1, for consistency).
269
+ */
270
+ repetitionId: string;
271
+ /** 1-based index of this execution within the repetition group. */
272
+ repetitionIndex: number;
273
+ /** Total scheduled executions for this logical test case in the current run. */
274
+ repetitionCount: number;
246
275
  }
247
276
  interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>> {
248
277
  input: TInput;
@@ -250,6 +279,12 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
250
279
  output?: TOutput;
251
280
  /** Metadata about the current evaluator invocation. */
252
281
  meta: EvaluateMeta;
282
+ /** Tags from `TestCase.describe({ tags })` for the current test case. */
283
+ testCaseTags: string[];
284
+ /** Tags from `RunConfig.define({ tags })` for this job; empty for programmatic runs unless set on the request. */
285
+ runConfigTags: string[];
286
+ /** Tags from `Evaluator.define({ tags })` for this evaluator. */
287
+ evaluatorTags: string[];
253
288
  /** Records a diff for this test case; stored in run artifact and shown by CLI */
254
289
  logDiff: (expected: unknown, actual: unknown, options?: CreateDiffLogEntryOptions) => void;
255
290
  /** Logs a message or object for this test case; stored in run artifact and shown by CLI */
@@ -266,12 +301,20 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
266
301
  }
267
302
  type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
268
303
  interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
304
+ /**
305
+ * Stable id (letters, digits, `_`, `-`); used for discovery, name patterns, and `meta`.
306
+ * For an unrestricted UI label, set {@link displayName}.
307
+ */
269
308
  name: string;
309
+ /** Optional human-readable label for CLI/TUI (any characters). */
310
+ displayName?: string;
270
311
  inputSchema: TI;
271
312
  outputSchema: TO;
272
313
  scoreSchema: TS;
273
314
  passThreshold?: number;
274
315
  passCriterion?: (score: unknown) => boolean;
316
+ /** Optional tags for this evaluator; surfaced on every `evaluate` invocation. */
317
+ tags?: ReadonlyArray<string>;
275
318
  }
276
319
  declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
277
320
  private readonly _config;
@@ -281,7 +324,13 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
281
324
  use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
282
325
  define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
283
326
  evaluate(fn: EvaluateFn<TInput, TOutput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
327
+ /** Canonical evaluator id when defined; otherwise undefined (middleware-only chain). */
284
328
  getName(): string | undefined;
329
+ getDisplayName(): string | undefined;
330
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. Undefined if not yet defined. */
331
+ getDisplayLabel(): string | undefined;
332
+ /** Tags from `Evaluator.define({ tags })`; empty until defined. */
333
+ getTags(): string[];
285
334
  getInputSchema(): Schema.Schema.Any | undefined;
286
335
  getOutputSchema(): Schema.Schema.Any | undefined;
287
336
  getScoreSchema(): Schema.Schema.Any | undefined;
@@ -291,6 +340,15 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
291
340
  getPassCriterion(): ((score: unknown) => boolean) | undefined;
292
341
  resolveContext(): Promise<TCtx>;
293
342
  }
343
+ /** CLI-friendly label: {@link Evaluator.getDisplayLabel} when present, else {@link Evaluator.getName} (supports plain evaluator-shaped objects from discovery). */
344
+ declare function getEvaluatorDisplayLabel(evaluator: {
345
+ getDisplayLabel?: () => string | undefined;
346
+ getName?: () => string | undefined;
347
+ }): string | undefined;
348
+ /** Tags for evaluator `args.evaluatorTags` (plain evaluator-shaped objects without `getTags` yield `[]`). */
349
+ declare function getEvaluatorTagList(evaluator: {
350
+ getTags?: () => ReadonlyArray<string>;
351
+ }): string[];
294
352
 
295
353
  interface MetricItem<TData = unknown> {
296
354
  readonly id: string;
@@ -320,6 +378,72 @@ declare const Metric: {
320
378
  };
321
379
  declare function getMetricById(id: string): MetricDef<unknown> | undefined;
322
380
 
381
+ /** Branded id for `RunConfig` `name` (decode with {@link RunConfigNameSchema}). */
382
+ declare const RunConfigNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "RunConfigName">;
383
+ /** Branded id for `Evaluator.define({ name })` (decode with {@link EvaluatorNameSchema}). */
384
+ declare const EvaluatorNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "EvaluatorName">;
385
+ /** Branded id for `TestCase.describe({ name })` (decode with {@link TestCaseNameSchema}). */
386
+ declare const TestCaseNameSchema: Schema.brand<Schema.filter<Schema.filter<Schema.filter<typeof Schema.String>>>, "TestCaseName">;
387
+ type RunConfigName = Schema.Schema.Type<typeof RunConfigNameSchema>;
388
+ type EvaluatorName = Schema.Schema.Type<typeof EvaluatorNameSchema>;
389
+ type TestCaseName = Schema.Schema.Type<typeof TestCaseNameSchema>;
390
+ declare function validateRunConfigName(raw: string, context: string): RunConfigName;
391
+ declare function validateEvaluatorName(raw: string, context: string): EvaluatorName;
392
+ declare function validateTestCaseName(raw: string, context: string): TestCaseName;
393
+ /** Optional UI label: trim; empty after trim becomes undefined. */
394
+ declare function normalizeOptionalDisplayName(raw: string | undefined): string | undefined;
395
+
396
+ /** Evaluator reference for heterogeneous rows. Uses `any` type arguments because `unknown` breaks assignability from concrete `EvaluateFn` (contravariance on `input`). */
397
+ type RunConfigEvaluatorRef = Evaluator<any, any, any, any>;
398
+ /** Select evaluators by concrete instances (same module exports as discovery). */
399
+ interface RunConfigRowEvaluators {
400
+ readonly dataset: Dataset;
401
+ readonly evaluators: ReadonlyArray<RunConfigEvaluatorRef>;
402
+ readonly evaluatorPattern?: undefined;
403
+ /**
404
+ * How many times each test case in this dataset is evaluated for this row (default: 1).
405
+ * All executions of the same logical test case share one `repetitionId` in evaluator `meta`.
406
+ */
407
+ readonly repetitions?: number;
408
+ }
409
+ /** Select evaluators using the same wildcard / regex rules as the runner's `resolveEvaluatorsByNamePattern`. */
410
+ interface RunConfigRowPattern {
411
+ readonly dataset: Dataset;
412
+ readonly evaluatorPattern: string;
413
+ readonly evaluators?: undefined;
414
+ readonly repetitions?: number;
415
+ }
416
+ type RunConfigRow = RunConfigRowEvaluators | RunConfigRowPattern;
417
+ interface RunConfigDefineConfig {
418
+ /**
419
+ * Stable id (letters, digits, `_`, `-`); surfaced in discovery, CLI `--run-config`, and evaluator `meta`.
420
+ * For an unrestricted UI label, set {@link displayName}.
421
+ */
422
+ name: string;
423
+ /** Optional human-readable label for CLI/TUI (any characters). */
424
+ displayName?: string;
425
+ /** Optional tags; copied to every evaluation as `runConfigTags` on the evaluator callback. */
426
+ tags?: ReadonlyArray<string>;
427
+ runs: ReadonlyArray<RunConfigRow>;
428
+ }
429
+ declare class RunConfig {
430
+ private readonly _name;
431
+ private readonly _displayName;
432
+ private readonly _tags;
433
+ private readonly _runs;
434
+ private constructor();
435
+ static define(config: RunConfigDefineConfig): RunConfig;
436
+ /** Canonical id (branded {@link RunConfigName} at runtime; typed as `string` for ergonomics). */
437
+ getName(): string;
438
+ /** Optional unrestricted display label. */
439
+ getDisplayName(): string | undefined;
440
+ /** Label for CLI/TUI: {@link getDisplayName} if set, otherwise {@link getName}. */
441
+ getDisplayLabel(): string;
442
+ /** Tags from `RunConfig.define({ tags })`; surfaced as `runConfigTags` on evaluator callbacks. */
443
+ getTags(): string[];
444
+ getRuns(): ReadonlyArray<RunConfigRow>;
445
+ }
446
+
323
447
  type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
324
448
  interface ScoreItem<TData = unknown> {
325
449
  readonly id: string;
@@ -387,6 +511,29 @@ interface CollectedEvaluator {
387
511
  filePath: string;
388
512
  evaluator: Evaluator<unknown, unknown, unknown, unknown>;
389
513
  }
514
+ interface CollectedRunConfig {
515
+ id: string;
516
+ filePath: string;
517
+ runConfig: RunConfig;
518
+ }
519
+ /** One dataset + evaluator set queued as part of a RunConfig or batch run. */
520
+ interface RunDatasetJob {
521
+ datasetId: string;
522
+ evaluatorIds: ReadonlyArray<string>;
523
+ /** RunConfig name (same as `RunConfig.getName()`). */
524
+ runConfigName: string;
525
+ /**
526
+ * Human-facing RunConfig label (`displayName ?? name`) when the job was expanded from `RunConfig.define`.
527
+ * Omitted for ad-hoc jobs; UI should fall back to {@link runConfigName}.
528
+ */
529
+ runConfigDisplayLabel?: string;
530
+ /**
531
+ * Tags from `RunConfig.define({ tags })` for this job; forwarded as `runConfigTags` on evaluator callbacks.
532
+ */
533
+ runConfigTags?: ReadonlyArray<string>;
534
+ /** Evaluates each matching test case this many times (default 1). */
535
+ repetitions: number;
536
+ }
390
537
  interface CollectedTestCase {
391
538
  id: string;
392
539
  filePath: string;
@@ -398,6 +545,10 @@ interface SearchTestCasesQuery {
398
545
  includedPaths?: ReadonlyArray<string | RegExp>;
399
546
  excludedPaths?: ReadonlyArray<string | RegExp>;
400
547
  }
548
+ /** Use with `RunDatasetRequest` for API / TUI runs that are not backed by a `RunConfig` file. */
549
+ declare const PROGRAMMATIC_RUN_CONFIG: {
550
+ readonly runConfigName: "programmatic";
551
+ };
401
552
  interface RunDatasetRequest {
402
553
  /**
403
554
  * Identifier for what triggered the run request (for example, a CLI command).
@@ -406,7 +557,17 @@ interface RunDatasetRequest {
406
557
  triggerId?: string;
407
558
  datasetId: string;
408
559
  evaluatorIds: ReadonlyArray<string>;
560
+ /** RunConfig name surfaced on evaluator `meta` (from the job or `PROGRAMMATIC_RUN_CONFIG`). */
561
+ runConfigName: string;
409
562
  concurrency?: number;
563
+ /**
564
+ * How many times each test case is executed (default: 1). For RunConfig-backed runs, set per row on the config.
565
+ */
566
+ repetitions?: number;
567
+ /**
568
+ * Optional tags for this run; forwarded as `runConfigTags` on evaluator callbacks (e.g. suite labels).
569
+ */
570
+ runConfigTags?: ReadonlyArray<string>;
410
571
  }
411
572
  interface RunSnapshot {
412
573
  runId: string;
@@ -443,8 +604,9 @@ type RunnerEvent = {
443
604
  testCaseName: string;
444
605
  startedTestCases: number;
445
606
  totalTestCases: number;
446
- rerunIndex: number;
447
- rerunTotal: number;
607
+ repetitionId: string;
608
+ repetitionIndex: number;
609
+ repetitionCount: number;
448
610
  } | {
449
611
  type: 'TestCaseProgress';
450
612
  runId: string;
@@ -452,8 +614,9 @@ type RunnerEvent = {
452
614
  testCaseName: string;
453
615
  completedTestCases: number;
454
616
  totalTestCases: number;
455
- rerunIndex: number;
456
- rerunTotal: number;
617
+ repetitionId: string;
618
+ repetitionIndex: number;
619
+ repetitionCount: number;
457
620
  passed: boolean;
458
621
  durationMs: number;
459
622
  evaluatorScores: ReadonlyArray<{
@@ -488,11 +651,26 @@ type RunnerEvent = {
488
651
  interface SubscribeOptions {
489
652
  runId?: string;
490
653
  }
654
+ interface RunDatasetJobsWithSharedConcurrencyRequest {
655
+ jobs: ReadonlyArray<RunDatasetJob>;
656
+ globalConcurrency: number;
657
+ triggerId?: string;
658
+ }
491
659
  interface RunnerApi {
492
660
  collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
493
661
  collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
662
+ collectRunConfigs(): Promise<ReadonlyArray<CollectedRunConfig>>;
494
663
  resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
495
664
  resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
665
+ /**
666
+ * Resolves a RunConfig by display name (case-insensitive).
667
+ * @throws If more than one discovered RunConfig uses the same name (list file paths in the error).
668
+ */
669
+ resolveRunConfigByName(name: string): Promise<CollectedRunConfig | undefined>;
670
+ expandRunConfigToJobs(collected: CollectedRunConfig): Promise<ReadonlyArray<RunDatasetJob>>;
671
+ /** Resolves each name in order and concatenates expanded jobs (the same name may appear more than once). */
672
+ expandRunConfigNamesToJobs(names: ReadonlyArray<string>): Promise<ReadonlyArray<RunDatasetJob>>;
673
+ runDatasetJobsWithSharedConcurrency(request: RunDatasetJobsWithSharedConcurrencyRequest): Promise<ReadonlyArray<RunSnapshot>>;
496
674
  searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
497
675
  collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
498
676
  runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
@@ -538,4 +716,20 @@ interface BinaryScoreData {
538
716
  }
539
717
  declare const binaryScore: ScoreDef<BinaryScoreData>;
540
718
 
541
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getLogLines, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
719
+ /**
720
+ * Maps each tag literal to a `string`-typed value (at runtime, the tag string itself).
721
+ * Lets you reference `set['my-tag']` with autocomplete and errors on unknown keys.
722
+ */
723
+ type TagSetMembers<T extends readonly string[]> = {
724
+ readonly [K in T[number]]: string;
725
+ };
726
+ /**
727
+ * Closed set of tag strings for type-safe references (`set['alpha']` is valid; `set['nope']` is a type error).
728
+ * Values are plain `string`, so they assign to `string[]`, dataset matchers, etc.
729
+ */
730
+ declare class TagSet {
731
+ private constructor();
732
+ static define<const T extends readonly string[]>(tags: T): TagSetMembers<T>;
733
+ }
734
+
735
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedRunConfig, CollectedTestCase, ConfigType, CreateDiffLogEntryOptions, Dataset, DeltaScoreData, DiffLogEntry, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, EvaluateMeta, Evaluator, EvaluatorLogEntry, EvaluatorName, EvaluatorNameSchema, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, JsonDiffOptions, LatencyData, LogEntry, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PROGRAMMATIC_RUN_CONFIG, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunConfig, RunConfigDefineConfig, RunConfigName, RunConfigNameSchema, RunConfigRow, RunConfigRowEvaluators, RunConfigRowPattern, RunDatasetJob, RunDatasetJobsWithSharedConcurrencyRequest, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TagSet, TagSetMembers, TestCase, TestCaseName, TestCaseNameSchema, TokenCountData, ViewLevel, binaryScore, createLogEntry, createRunner, defaultRunnerConfig, defineConfig, deltaScore, formatScoreData, getEvaluatorDisplayLabel, getEvaluatorTagList, getLogLines, getMetricById, getScoreById, getTestCaseDisplayLabel, getTestCaseTagList, latencyMetric, loadMockData, loadRunnerData, normalizeOptionalDisplayName, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, validateEvaluatorName, validateRunConfigName, validateTestCaseName, withRunnerConfig };