@m4trix/evals 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -91,6 +91,8 @@ interface RunnerDiscoveryConfig {
91
91
  interface RunnerConfig {
92
92
  discovery: RunnerDiscoveryConfig;
93
93
  artifactDirectory: string;
94
+ /** Max concurrent test cases per run. Default: 1 (sequential). */
95
+ maxConcurrency: number;
94
96
  }
95
97
  type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
96
98
  discovery?: Partial<RunnerDiscoveryConfig>;
@@ -108,6 +110,8 @@ interface M4trixEvalConfigDiscovery {
108
110
  interface M4trixEvalConfig {
109
111
  discovery?: M4trixEvalConfigDiscovery;
110
112
  artifactDirectory?: string;
113
+ /** Max concurrent test cases per run. Default: 1 (sequential). */
114
+ maxConcurrency?: number;
111
115
  }
112
116
  type ConfigType = M4trixEvalConfig;
113
117
  type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
@@ -124,6 +128,7 @@ type InputOrBuilder<T> = T | (() => T);
124
128
  interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
125
129
  name: string;
126
130
  tags: string[];
131
+ reruns?: number;
127
132
  inputSchema: TI;
128
133
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
129
134
  outputSchema?: TO;
@@ -133,6 +138,7 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
133
138
  private readonly _config;
134
139
  private constructor();
135
140
  static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
141
+ getReruns(): number;
136
142
  getName(): string;
137
143
  getTags(): string[];
138
144
  getInputSchema(): Schema.Schema.Any;
@@ -225,17 +231,22 @@ interface MetricItem<TData = unknown> {
225
231
  readonly id: string;
226
232
  readonly data: TData;
227
233
  }
234
+ interface FormatMetricOptions {
235
+ isAggregated?: boolean;
236
+ }
228
237
  interface MetricDef<TData = unknown> {
229
238
  readonly id: string;
230
239
  readonly name?: string;
231
- format(data: TData): string;
240
+ readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
241
+ format(data: TData, options?: FormatMetricOptions): string;
232
242
  make(data: TData): MetricItem<TData>;
233
243
  }
234
244
  declare const Metric: {
235
245
  of<TData>(config: {
236
246
  id: string;
237
247
  name?: string | undefined;
238
- format: (data: TData) => string;
248
+ format: (data: TData, options?: FormatMetricOptions) => string;
249
+ aggregate?: ((values: readonly TData[]) => TData) | undefined;
239
250
  }): MetricDef<TData>;
240
251
  };
241
252
  declare function getMetricById(id: string): MetricDef<unknown> | undefined;
@@ -246,11 +257,15 @@ interface ScoreItem<TData = unknown> {
246
257
  readonly data: TData;
247
258
  readonly passed?: boolean;
248
259
  }
260
+ interface FormatScoreOptions {
261
+ isAggregated?: boolean;
262
+ }
249
263
  interface ScoreDef<TData = unknown> {
250
264
  readonly id: string;
251
265
  readonly name?: string;
252
266
  readonly displayStrategy: ScoreDisplayStrategy;
253
- format(data: TData): string;
267
+ readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
268
+ format(data: TData, options?: FormatScoreOptions): string;
254
269
  make(data: TData, options?: {
255
270
  definePassed?: (data: TData) => boolean;
256
271
  }): ScoreItem<TData>;
@@ -260,7 +275,8 @@ declare const Score: {
260
275
  id: string;
261
276
  name?: string | undefined;
262
277
  displayStrategy: ScoreDisplayStrategy;
263
- format: (data: TData) => string;
278
+ format: (data: TData, options?: FormatScoreOptions) => string;
279
+ aggregate?: ((values: readonly TData[]) => TData) | undefined;
264
280
  }): ScoreDef<TData>;
265
281
  };
266
282
  declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
@@ -326,6 +342,8 @@ type RunnerEvent = {
326
342
  testCaseName: string;
327
343
  completedTestCases: number;
328
344
  totalTestCases: number;
345
+ rerunIndex: number;
346
+ rerunTotal: number;
329
347
  passed: boolean;
330
348
  durationMs: number;
331
349
  evaluatorScores: ReadonlyArray<{
@@ -371,6 +389,7 @@ interface RunnerApi {
371
389
  subscribeRunEvents(listener: (event: RunnerEvent) => void, options?: SubscribeOptions): () => void;
372
390
  getRunSnapshot(runId: string): RunSnapshot | undefined;
373
391
  getAllRunSnapshots(): ReadonlyArray<RunSnapshot>;
392
+ loadRunSnapshotsFromArtifacts(): Promise<ReadonlyArray<RunSnapshot>>;
374
393
  shutdown(): Promise<void>;
375
394
  }
376
395
  declare function createRunner(overrides?: RunnerConfigOverrides): RunnerApi;
@@ -400,4 +419,4 @@ interface BinaryScoreData {
400
419
  }
401
420
  declare const binaryScore: ScoreDef<BinaryScoreData>;
402
421
 
403
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
422
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };