@m4trix/evals 0.11.0 → 0.13.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/cli-simple.cjs +719 -227
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +721 -229
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +1320 -928
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +1322 -930
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +335 -99
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +24 -5
- package/dist/index.js +337 -101
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -91,6 +91,8 @@ interface RunnerDiscoveryConfig {
|
|
|
91
91
|
interface RunnerConfig {
|
|
92
92
|
discovery: RunnerDiscoveryConfig;
|
|
93
93
|
artifactDirectory: string;
|
|
94
|
+
/** Max concurrent test cases per run. Default: 1 (sequential). */
|
|
95
|
+
maxConcurrency: number;
|
|
94
96
|
}
|
|
95
97
|
type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
|
|
96
98
|
discovery?: Partial<RunnerDiscoveryConfig>;
|
|
@@ -108,6 +110,8 @@ interface M4trixEvalConfigDiscovery {
|
|
|
108
110
|
interface M4trixEvalConfig {
|
|
109
111
|
discovery?: M4trixEvalConfigDiscovery;
|
|
110
112
|
artifactDirectory?: string;
|
|
113
|
+
/** Max concurrent test cases per run. Default: 1 (sequential). */
|
|
114
|
+
maxConcurrency?: number;
|
|
111
115
|
}
|
|
112
116
|
type ConfigType = M4trixEvalConfig;
|
|
113
117
|
type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
|
|
@@ -124,6 +128,7 @@ type InputOrBuilder<T> = T | (() => T);
|
|
|
124
128
|
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
125
129
|
name: string;
|
|
126
130
|
tags: string[];
|
|
131
|
+
reruns?: number;
|
|
127
132
|
inputSchema: TI;
|
|
128
133
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
129
134
|
outputSchema?: TO;
|
|
@@ -133,6 +138,7 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
|
|
|
133
138
|
private readonly _config;
|
|
134
139
|
private constructor();
|
|
135
140
|
static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
|
|
141
|
+
getReruns(): number;
|
|
136
142
|
getName(): string;
|
|
137
143
|
getTags(): string[];
|
|
138
144
|
getInputSchema(): Schema.Schema.Any;
|
|
@@ -225,17 +231,22 @@ interface MetricItem<TData = unknown> {
|
|
|
225
231
|
readonly id: string;
|
|
226
232
|
readonly data: TData;
|
|
227
233
|
}
|
|
234
|
+
interface FormatMetricOptions {
|
|
235
|
+
isAggregated?: boolean;
|
|
236
|
+
}
|
|
228
237
|
interface MetricDef<TData = unknown> {
|
|
229
238
|
readonly id: string;
|
|
230
239
|
readonly name?: string;
|
|
231
|
-
|
|
240
|
+
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
241
|
+
format(data: TData, options?: FormatMetricOptions): string;
|
|
232
242
|
make(data: TData): MetricItem<TData>;
|
|
233
243
|
}
|
|
234
244
|
declare const Metric: {
|
|
235
245
|
of<TData>(config: {
|
|
236
246
|
id: string;
|
|
237
247
|
name?: string | undefined;
|
|
238
|
-
format: (data: TData) => string;
|
|
248
|
+
format: (data: TData, options?: FormatMetricOptions) => string;
|
|
249
|
+
aggregate?: ((values: readonly TData[]) => TData) | undefined;
|
|
239
250
|
}): MetricDef<TData>;
|
|
240
251
|
};
|
|
241
252
|
declare function getMetricById(id: string): MetricDef<unknown> | undefined;
|
|
@@ -246,11 +257,15 @@ interface ScoreItem<TData = unknown> {
|
|
|
246
257
|
readonly data: TData;
|
|
247
258
|
readonly passed?: boolean;
|
|
248
259
|
}
|
|
260
|
+
interface FormatScoreOptions {
|
|
261
|
+
isAggregated?: boolean;
|
|
262
|
+
}
|
|
249
263
|
interface ScoreDef<TData = unknown> {
|
|
250
264
|
readonly id: string;
|
|
251
265
|
readonly name?: string;
|
|
252
266
|
readonly displayStrategy: ScoreDisplayStrategy;
|
|
253
|
-
|
|
267
|
+
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
268
|
+
format(data: TData, options?: FormatScoreOptions): string;
|
|
254
269
|
make(data: TData, options?: {
|
|
255
270
|
definePassed?: (data: TData) => boolean;
|
|
256
271
|
}): ScoreItem<TData>;
|
|
@@ -260,7 +275,8 @@ declare const Score: {
|
|
|
260
275
|
id: string;
|
|
261
276
|
name?: string | undefined;
|
|
262
277
|
displayStrategy: ScoreDisplayStrategy;
|
|
263
|
-
format: (data: TData) => string;
|
|
278
|
+
format: (data: TData, options?: FormatScoreOptions) => string;
|
|
279
|
+
aggregate?: ((values: readonly TData[]) => TData) | undefined;
|
|
264
280
|
}): ScoreDef<TData>;
|
|
265
281
|
};
|
|
266
282
|
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
|
|
@@ -326,6 +342,8 @@ type RunnerEvent = {
|
|
|
326
342
|
testCaseName: string;
|
|
327
343
|
completedTestCases: number;
|
|
328
344
|
totalTestCases: number;
|
|
345
|
+
rerunIndex: number;
|
|
346
|
+
rerunTotal: number;
|
|
329
347
|
passed: boolean;
|
|
330
348
|
durationMs: number;
|
|
331
349
|
evaluatorScores: ReadonlyArray<{
|
|
@@ -371,6 +389,7 @@ interface RunnerApi {
|
|
|
371
389
|
subscribeRunEvents(listener: (event: RunnerEvent) => void, options?: SubscribeOptions): () => void;
|
|
372
390
|
getRunSnapshot(runId: string): RunSnapshot | undefined;
|
|
373
391
|
getAllRunSnapshots(): ReadonlyArray<RunSnapshot>;
|
|
392
|
+
loadRunSnapshotsFromArtifacts(): Promise<ReadonlyArray<RunSnapshot>>;
|
|
374
393
|
shutdown(): Promise<void>;
|
|
375
394
|
}
|
|
376
395
|
declare function createRunner(overrides?: RunnerConfigOverrides): RunnerApi;
|
|
@@ -400,4 +419,4 @@ interface BinaryScoreData {
|
|
|
400
419
|
}
|
|
401
420
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
402
421
|
|
|
403
|
-
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|
|
422
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
|