@m4trix/evals 0.5.0 → 0.7.0

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -633,9 +633,18 @@ function mergeRunnerOverrides(base, next) {
633
633
  if (!base) {
634
634
  return next;
635
635
  }
636
- {
636
+ if (!next) {
637
637
  return base;
638
638
  }
639
+ const discovery = base.discovery || next.discovery ? {
640
+ ...base.discovery ?? {},
641
+ ...next.discovery ?? {}
642
+ } : void 0;
643
+ return {
644
+ ...base,
645
+ ...next,
646
+ discovery
647
+ };
639
648
  }
640
649
  function createRunner(overrides) {
641
650
  const fileOverrides = loadRunnerConfigFile();
File without changes
@@ -2,7 +2,7 @@
2
2
  import { randomUUID } from 'crypto';
3
3
  import { Effect, PubSub, Queue, Fiber } from 'effect';
4
4
  import { existsSync } from 'fs';
5
- import { resolve, parse, join, relative, dirname } from 'path';
5
+ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
@@ -610,9 +610,18 @@ function mergeRunnerOverrides(base, next) {
610
610
  if (!base) {
611
611
  return next;
612
612
  }
613
- {
613
+ if (!next) {
614
614
  return base;
615
615
  }
616
+ const discovery = base.discovery || next.discovery ? {
617
+ ...base.discovery ?? {},
618
+ ...next.discovery ?? {}
619
+ } : void 0;
620
+ return {
621
+ ...base,
622
+ ...next,
623
+ discovery
624
+ };
616
625
  }
617
626
  function createRunner(overrides) {
618
627
  const fileOverrides = loadRunnerConfigFile();
package/dist/cli.cjs CHANGED
@@ -1899,9 +1899,18 @@ function mergeRunnerOverrides(base, next) {
1899
1899
  if (!base) {
1900
1900
  return next;
1901
1901
  }
1902
- {
1902
+ if (!next) {
1903
1903
  return base;
1904
1904
  }
1905
+ const discovery = base.discovery || next.discovery ? {
1906
+ ...base.discovery ?? {},
1907
+ ...next.discovery ?? {}
1908
+ } : void 0;
1909
+ return {
1910
+ ...base,
1911
+ ...next,
1912
+ discovery
1913
+ };
1905
1914
  }
1906
1915
  function createRunner(overrides) {
1907
1916
  const fileOverrides = loadRunnerConfigFile();
package/dist/cli.d.ts CHANGED
File without changes
package/dist/cli.js CHANGED
@@ -1873,9 +1873,18 @@ function mergeRunnerOverrides(base, next) {
1873
1873
  if (!base) {
1874
1874
  return next;
1875
1875
  }
1876
- {
1876
+ if (!next) {
1877
1877
  return base;
1878
1878
  }
1879
+ const discovery = base.discovery || next.discovery ? {
1880
+ ...base.discovery ?? {},
1881
+ ...next.discovery ?? {}
1882
+ } : void 0;
1883
+ return {
1884
+ ...base,
1885
+ ...next,
1886
+ discovery
1887
+ };
1879
1888
  }
1880
1889
  function createRunner(overrides) {
1881
1890
  const fileOverrides = loadRunnerConfigFile();
package/dist/index.cjs CHANGED
@@ -1318,7 +1318,7 @@ var EffectRunner = class {
1318
1318
  }
1319
1319
  };
1320
1320
 
1321
- Object.defineProperty(exports, "S", {
1321
+ Object.defineProperty(exports, 'S', {
1322
1322
  enumerable: true,
1323
1323
  get: function () { return effect.Schema; }
1324
1324
  });
package/dist/index.d.ts CHANGED
@@ -210,7 +210,7 @@ interface MetricDef<TData = unknown> {
210
210
  declare const Metric: {
211
211
  of<TData>(config: {
212
212
  id: string;
213
- name?: string;
213
+ name?: string | undefined;
214
214
  format: (data: TData) => string;
215
215
  }): MetricDef<TData>;
216
216
  };
@@ -234,7 +234,7 @@ interface ScoreDef<TData = unknown> {
234
234
  declare const Score: {
235
235
  of<TData>(config: {
236
236
  id: string;
237
- name?: string;
237
+ name?: string | undefined;
238
238
  displayStrategy: ScoreDisplayStrategy;
239
239
  format: (data: TData) => string;
240
240
  }): ScoreDef<TData>;
@@ -375,4 +375,4 @@ interface BinaryScoreData {
375
375
  }
376
376
  declare const binaryScore: ScoreDef<BinaryScoreData>;
377
377
 
378
- export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
378
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@m4trix/evals",
3
- "version": "0.5.0",
3
+ "version": "0.7.0",
4
4
  "description": "Ink-based evaluation console for m4trix",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -1 +0,0 @@
1
- #!/usr/bin/env node
package/dist/cli.d.cts DELETED
@@ -1 +0,0 @@
1
- #!/usr/bin/env node
package/dist/index.d.cts DELETED
@@ -1,378 +0,0 @@
1
- import { Schema } from 'effect';
2
- export { Schema as S } from 'effect';
3
-
4
- type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
5
- interface EvalDimension {
6
- name: string;
7
- score: number;
8
- }
9
- interface EvalCheck {
10
- name: string;
11
- passed: boolean;
12
- detail?: string;
13
- }
14
- interface EvalFailure {
15
- title: string;
16
- }
17
- interface EvalPerformance {
18
- passRate: number;
19
- avgScore: number;
20
- latencyP95Ms: number;
21
- latencyAvgMs: number;
22
- tokensAvg: number;
23
- tokensP95: number;
24
- costUsd: number;
25
- /** Per-sample latency in ms for sparkline (e.g. last N requests) */
26
- latencyHistoryMs?: number[];
27
- }
28
- interface EvalRunMeta {
29
- model: string;
30
- provider: string;
31
- commit: string;
32
- branch: string;
33
- seed: number;
34
- concurrency: number;
35
- duration: string;
36
- artifact: string;
37
- }
38
- interface EvalRun {
39
- id: string;
40
- label: string;
41
- status: EvalStatus;
42
- performance: EvalPerformance;
43
- dimensions: EvalDimension[];
44
- checks: EvalCheck[];
45
- failures: EvalFailure[];
46
- meta: EvalRunMeta;
47
- }
48
- interface EvalDataset {
49
- id: string;
50
- name: string;
51
- overview: string;
52
- runs: EvalRun[];
53
- }
54
- interface EvaluatorOption {
55
- id: string;
56
- name: string;
57
- configPreview: string;
58
- }
59
- interface EvalsData {
60
- datasets: EvalDataset[];
61
- evaluators: EvaluatorOption[];
62
- }
63
- type PaneFocus = 'left' | 'right';
64
- type ViewLevel = 'datasets' | 'runs' | 'details' | 'new-evaluation';
65
- interface StartupArgs {
66
- datasetId?: string;
67
- runId?: string;
68
- search?: string;
69
- unknownArgs: string[];
70
- }
71
- interface CliState {
72
- level: ViewLevel;
73
- focus: PaneFocus;
74
- datasetMenuIndex: number;
75
- runMenuIndex: number;
76
- detailsScrollOffset: number;
77
- selectedEvaluatorIds: string[];
78
- evaluatorMenuIndex: number;
79
- searchQuery: string;
80
- searchMode: boolean;
81
- startupWarnings: string[];
82
- }
83
-
84
- interface RunnerDiscoveryConfig {
85
- rootDir: string;
86
- datasetSuffixes: ReadonlyArray<string>;
87
- evaluatorSuffixes: ReadonlyArray<string>;
88
- testCaseSuffixes: ReadonlyArray<string>;
89
- excludeDirectories: ReadonlyArray<string>;
90
- }
91
- interface RunnerConfig {
92
- discovery: RunnerDiscoveryConfig;
93
- artifactDirectory: string;
94
- }
95
- type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
96
- discovery?: Partial<RunnerDiscoveryConfig>;
97
- };
98
- interface M4trixEvalConfigDiscovery {
99
- rootDir?: string;
100
- datasetFilePatterns?: ReadonlyArray<string>;
101
- evaluatorFilePatterns?: ReadonlyArray<string>;
102
- testCaseFilePatterns?: ReadonlyArray<string>;
103
- datasetSuffixes?: ReadonlyArray<string>;
104
- evaluatorSuffixes?: ReadonlyArray<string>;
105
- testCaseSuffixes?: ReadonlyArray<string>;
106
- excludeDirectories?: ReadonlyArray<string>;
107
- }
108
- interface M4trixEvalConfig {
109
- discovery?: M4trixEvalConfigDiscovery;
110
- artifactDirectory?: string;
111
- }
112
- type ConfigType = M4trixEvalConfig;
113
- type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
114
- declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
115
- declare const defaultRunnerConfig: RunnerConfig;
116
- declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
117
-
118
- /** Matches a tag by exact string equality or regex test */
119
- type TagMatcher = string | RegExp;
120
- /** Matches a file path by glob string or regex test */
121
- type PathMatcher = string | RegExp;
122
-
123
- type InputOrBuilder<T> = T | (() => T);
124
- interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
125
- name: string;
126
- tags: string[];
127
- inputSchema: TI;
128
- input: InputOrBuilder<Schema.Schema.Type<TI>>;
129
- outputSchema?: TO;
130
- output?: InputOrBuilder<Schema.Schema.Type<TO>>;
131
- }
132
- declare class TestCase<TInput = unknown, TOutput = unknown> {
133
- private readonly _config;
134
- private constructor();
135
- static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
136
- getName(): string;
137
- getTags(): string[];
138
- getInputSchema(): Schema.Schema.Any;
139
- getInput(): TInput;
140
- getOutputSchema(): Schema.Schema.Any | undefined;
141
- getOutput(): TOutput | undefined;
142
- }
143
-
144
- interface DatasetDefineConfig {
145
- name: string;
146
- includedTags?: TagMatcher[];
147
- excludedTags?: TagMatcher[];
148
- includedPaths?: PathMatcher[];
149
- excludedPaths?: PathMatcher[];
150
- }
151
- declare class Dataset {
152
- private readonly _config;
153
- private constructor();
154
- static define(config: DatasetDefineConfig): Dataset;
155
- getName(): string;
156
- getIncludedTags(): ReadonlyArray<TagMatcher>;
157
- getExcludedTags(): ReadonlyArray<TagMatcher>;
158
- getIncludedPaths(): ReadonlyArray<PathMatcher>;
159
- getExcludedPaths(): ReadonlyArray<PathMatcher>;
160
- matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
161
- }
162
-
163
- interface EvalMiddleware<TCtx> {
164
- name: string;
165
- resolve: () => TCtx | Promise<TCtx>;
166
- }
167
- interface EvaluateArgs<TInput, TCtx> {
168
- input: TInput;
169
- ctx: TCtx;
170
- output?: unknown;
171
- }
172
- type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
173
- interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
174
- name: string;
175
- inputSchema: TI;
176
- outputSchema: TO;
177
- scoreSchema: TS;
178
- passThreshold?: number;
179
- passCriterion?: (score: unknown) => boolean;
180
- }
181
- declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
182
- private readonly _config;
183
- private constructor();
184
- private getState;
185
- static use<TCtx>(middleware: EvalMiddleware<TCtx>): Evaluator<unknown, unknown, unknown, TCtx>;
186
- use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
187
- define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
188
- evaluate(fn: EvaluateFn<TInput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
189
- getName(): string | undefined;
190
- getInputSchema(): Schema.Schema.Any | undefined;
191
- getOutputSchema(): Schema.Schema.Any | undefined;
192
- getScoreSchema(): Schema.Schema.Any | undefined;
193
- getMiddlewares(): ReadonlyArray<EvalMiddleware<unknown>>;
194
- getEvaluateFn(): EvaluateFn<TInput, TScore, TCtx> | undefined;
195
- getPassThreshold(): number | undefined;
196
- getPassCriterion(): ((score: unknown) => boolean) | undefined;
197
- resolveContext(): Promise<TCtx>;
198
- }
199
-
200
- interface MetricItem<TData = unknown> {
201
- readonly id: string;
202
- readonly data: TData;
203
- }
204
- interface MetricDef<TData = unknown> {
205
- readonly id: string;
206
- readonly name?: string;
207
- format(data: TData): string;
208
- make(data: TData): MetricItem<TData>;
209
- }
210
- declare const Metric: {
211
- of<TData>(config: {
212
- id: string;
213
- name?: string;
214
- format: (data: TData) => string;
215
- }): MetricDef<TData>;
216
- };
217
- declare function getMetricById(id: string): MetricDef<unknown> | undefined;
218
-
219
- type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
220
- interface ScoreItem<TData = unknown> {
221
- readonly id: string;
222
- readonly data: TData;
223
- readonly passed?: boolean;
224
- }
225
- interface ScoreDef<TData = unknown> {
226
- readonly id: string;
227
- readonly name?: string;
228
- readonly displayStrategy: ScoreDisplayStrategy;
229
- format(data: TData): string;
230
- make(data: TData, options?: {
231
- definePassed?: (data: TData) => boolean;
232
- }): ScoreItem<TData>;
233
- }
234
- declare const Score: {
235
- of<TData>(config: {
236
- id: string;
237
- name?: string;
238
- displayStrategy: ScoreDisplayStrategy;
239
- format: (data: TData) => string;
240
- }): ScoreDef<TData>;
241
- };
242
- declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
243
-
244
- interface CollectedDataset {
245
- id: string;
246
- filePath: string;
247
- dataset: Dataset;
248
- }
249
- interface CollectedEvaluator {
250
- id: string;
251
- filePath: string;
252
- evaluator: Evaluator<unknown, unknown, unknown, unknown>;
253
- }
254
- interface CollectedTestCase {
255
- id: string;
256
- filePath: string;
257
- testCase: TestCase<unknown, unknown>;
258
- }
259
- interface SearchTestCasesQuery {
260
- includedTags?: ReadonlyArray<string | RegExp>;
261
- excludedTags?: ReadonlyArray<string | RegExp>;
262
- includedPaths?: ReadonlyArray<string | RegExp>;
263
- excludedPaths?: ReadonlyArray<string | RegExp>;
264
- }
265
- interface RunDatasetRequest {
266
- datasetId: string;
267
- evaluatorIds: ReadonlyArray<string>;
268
- concurrency?: number;
269
- }
270
- interface RunSnapshot {
271
- runId: string;
272
- datasetId: string;
273
- datasetName: string;
274
- evaluatorIds: ReadonlyArray<string>;
275
- queuedAt: number;
276
- startedAt?: number;
277
- finishedAt?: number;
278
- totalTestCases: number;
279
- completedTestCases: number;
280
- passedTestCases: number;
281
- failedTestCases: number;
282
- status: 'queued' | 'running' | 'completed' | 'failed';
283
- artifactPath: string;
284
- errorMessage?: string;
285
- }
286
- type RunnerEvent = {
287
- type: 'RunQueued';
288
- runId: string;
289
- datasetId: string;
290
- datasetName: string;
291
- evaluatorIds: ReadonlyArray<string>;
292
- totalTestCases: number;
293
- artifactPath: string;
294
- } | {
295
- type: 'RunStarted';
296
- runId: string;
297
- startedAt: number;
298
- } | {
299
- type: 'TestCaseProgress';
300
- runId: string;
301
- testCaseId: string;
302
- testCaseName: string;
303
- completedTestCases: number;
304
- totalTestCases: number;
305
- passed: boolean;
306
- durationMs: number;
307
- evaluatorScores: ReadonlyArray<{
308
- evaluatorId: string;
309
- scores: ReadonlyArray<ScoreItem>;
310
- passed: boolean;
311
- metrics?: ReadonlyArray<MetricItem>;
312
- }>;
313
- output?: unknown;
314
- errorMessage?: string;
315
- } | {
316
- type: 'RunCompleted';
317
- runId: string;
318
- finishedAt: number;
319
- passedTestCases: number;
320
- failedTestCases: number;
321
- totalTestCases: number;
322
- artifactPath: string;
323
- } | {
324
- type: 'RunFailed';
325
- runId: string;
326
- finishedAt: number;
327
- errorMessage: string;
328
- artifactPath: string;
329
- } | {
330
- type: 'ArtifactFlushed';
331
- runId: string;
332
- artifactPath: string;
333
- };
334
-
335
- interface SubscribeOptions {
336
- runId?: string;
337
- }
338
- interface RunnerApi {
339
- collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
340
- collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
341
- resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
342
- resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
343
- searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
344
- collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
345
- runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
346
- subscribeRunEvents(listener: (event: RunnerEvent) => void, options?: SubscribeOptions): () => void;
347
- getRunSnapshot(runId: string): RunSnapshot | undefined;
348
- getAllRunSnapshots(): ReadonlyArray<RunSnapshot>;
349
- shutdown(): Promise<void>;
350
- }
351
- declare function createRunner(overrides?: RunnerConfigOverrides): RunnerApi;
352
-
353
- declare function loadMockData(): EvalsData;
354
- declare function loadRunnerData(runner: RunnerApi): Promise<EvalsData>;
355
- declare function parseStartupArgs(argv: string[]): StartupArgs;
356
-
357
- interface TokenCountData {
358
- input?: number;
359
- output?: number;
360
- inputCached?: number;
361
- outputCached?: number;
362
- }
363
- declare const tokenCountMetric: MetricDef<TokenCountData>;
364
- interface LatencyData {
365
- ms: number;
366
- }
367
- declare const latencyMetric: MetricDef<LatencyData>;
368
-
369
- interface PercentScoreData {
370
- value: number;
371
- }
372
- declare const percentScore: ScoreDef<PercentScoreData>;
373
- interface BinaryScoreData {
374
- passed: boolean;
375
- }
376
- declare const binaryScore: ScoreDef<BinaryScoreData>;
377
-
378
- export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };