@m4trix/evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +1075 -0
- package/dist/cli-simple.cjs.map +1 -0
- package/dist/cli-simple.d.cts +1 -0
- package/dist/cli-simple.d.ts +1 -0
- package/dist/cli-simple.js +1072 -0
- package/dist/cli-simple.js.map +1 -0
- package/dist/cli.cjs +1981 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +1974 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1184 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +347 -0
- package/dist/index.d.ts +347 -0
- package/dist/index.js +1165 -0
- package/dist/index.js.map +1 -0
- package/package.json +53 -0
package/dist/index.d.ts
ADDED
@@ -0,0 +1,347 @@
import { Schema } from 'effect';

type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
interface EvalDimension {
    name: string;
    score: number;
}
interface EvalCheck {
    name: string;
    passed: boolean;
    detail?: string;
}
interface EvalFailure {
    title: string;
}
interface EvalPerformance {
    passRate: number;
    avgScore: number;
    latencyP95Ms: number;
    latencyAvgMs: number;
    tokensAvg: number;
    tokensP95: number;
    costUsd: number;
    /** Per-sample latency in ms for sparkline (e.g. last N requests) */
    latencyHistoryMs?: number[];
}
interface EvalRunMeta {
    model: string;
    provider: string;
    commit: string;
    branch: string;
    seed: number;
    concurrency: number;
    duration: string;
    artifact: string;
}
interface EvalRun {
    id: string;
    label: string;
    status: EvalStatus;
    performance: EvalPerformance;
    dimensions: EvalDimension[];
    checks: EvalCheck[];
    failures: EvalFailure[];
    meta: EvalRunMeta;
}
interface EvalDataset {
    id: string;
    name: string;
    overview: string;
    runs: EvalRun[];
}
interface EvaluatorOption {
    id: string;
    name: string;
    configPreview: string;
}
interface EvalsData {
    datasets: EvalDataset[];
    evaluators: EvaluatorOption[];
}
type PaneFocus = 'left' | 'right';
type ViewLevel = 'datasets' | 'runs' | 'details' | 'new-evaluation';
interface StartupArgs {
    datasetId?: string;
    runId?: string;
    search?: string;
    unknownArgs: string[];
}
interface CliState {
    level: ViewLevel;
    focus: PaneFocus;
    datasetMenuIndex: number;
    runMenuIndex: number;
    detailsScrollOffset: number;
    selectedEvaluatorIds: string[];
    evaluatorMenuIndex: number;
    searchQuery: string;
    searchMode: boolean;
    startupWarnings: string[];
}

interface RunnerDiscoveryConfig {
    rootDir: string;
    datasetSuffixes: ReadonlyArray<string>;
    evaluatorSuffixes: ReadonlyArray<string>;
    testCaseSuffixes: ReadonlyArray<string>;
    excludeDirectories: ReadonlyArray<string>;
}
interface RunnerConfig {
    discovery: RunnerDiscoveryConfig;
    artifactDirectory: string;
}
declare const defaultRunnerConfig: RunnerConfig;
declare function withRunnerConfig(overrides?: Partial<RunnerConfig>): RunnerConfig;

/** Matches a tag by exact string equality or regex test */
type TagMatcher = string | RegExp;
/** Matches a file path by glob string or regex test */
type PathMatcher = string | RegExp;

type InputOrBuilder<T> = T | (() => T);
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any> {
    name: string;
    tags: string[];
    inputSchema: TI;
    input: InputOrBuilder<Schema.Schema.Type<TI>>;
}
declare class TestCase<TInput = unknown> {
    private readonly _config;
    private constructor();
    static describe<TI extends Schema.Schema.Any>(config: TestCaseDescribeConfig<TI>): TestCase<Schema.Schema.Type<TI>>;
    getName(): string;
    getTags(): string[];
    getInputSchema(): Schema.Schema.Any;
    getInput(): TInput;
}

interface DatasetDefineConfig {
    name: string;
    includedTags?: TagMatcher[];
    excludedTags?: TagMatcher[];
    includedPaths?: PathMatcher[];
    excludedPaths?: PathMatcher[];
}
declare class Dataset {
    private readonly _config;
    private constructor();
    static define(config: DatasetDefineConfig): Dataset;
    getName(): string;
    getIncludedTags(): ReadonlyArray<TagMatcher>;
    getExcludedTags(): ReadonlyArray<TagMatcher>;
    getIncludedPaths(): ReadonlyArray<PathMatcher>;
    getExcludedPaths(): ReadonlyArray<PathMatcher>;
    matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
}
}
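
A minimal usage sketch inferred from the declarations above (not taken from package docs); the test-case name, tags, schema shape, and matchers are all illustrative.

```ts
import { Schema } from 'effect';
import { Dataset, TestCase } from '@m4trix/evals';

// Hypothetical test case: schema shape, name, and tags are illustrative only.
const greetingCase = TestCase.describe({
  name: 'greets the user',
  tags: ['smoke', 'chat'],
  inputSchema: Schema.Struct({ prompt: Schema.String }),
  // input accepts either a value or a lazy builder (InputOrBuilder)
  input: () => ({ prompt: 'Say hello to Ada.' }),
});

// Dataset selecting test cases by tag/path matchers (strings or RegExps)
const smokeDataset = Dataset.define({
  name: 'smoke',
  includedTags: ['smoke'],
  excludedPaths: [/\.skip\./],
});
```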

interface EvalMiddleware<TCtx> {
    name: string;
    resolve: () => TCtx | Promise<TCtx>;
}
type EvaluateFn<TInput, TScore, TCtx> = (input: TInput, ctx: TCtx) => TScore | Promise<TScore>;
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
    name: string;
    inputSchema: TI;
    outputSchema: TO;
    scoreSchema: TS;
    passThreshold?: number;
    passCriterion?: (score: unknown) => boolean;
}
declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
    private readonly _config;
    private constructor();
    private getState;
    static use<TCtx>(middleware: EvalMiddleware<TCtx>): Evaluator<unknown, unknown, unknown, TCtx>;
    use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
    define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
    evaluate(fn: EvaluateFn<TInput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
    getName(): string | undefined;
    getInputSchema(): Schema.Schema.Any | undefined;
    getOutputSchema(): Schema.Schema.Any | undefined;
    getScoreSchema(): Schema.Schema.Any | undefined;
    getMiddlewares(): ReadonlyArray<EvalMiddleware<unknown>>;
    getEvaluateFn(): EvaluateFn<TInput, TScore, TCtx> | undefined;
    getPassThreshold(): number | undefined;
    getPassCriterion(): ((score: unknown) => boolean) | undefined;
    resolveContext(): Promise<TCtx>;
}
}
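
The Evaluator declaration suggests a builder chain: static use() supplies middleware context, define() fixes the schemas and pass criteria, evaluate() attaches the scoring function. A sketch under that assumption, with a hypothetical middleware and schemas:

```ts
import { Schema } from 'effect';
import { Evaluator, type EvalMiddleware } from '@m4trix/evals';

// Hypothetical middleware providing shared context to evaluate().
const clock: EvalMiddleware<{ now: () => number }> = {
  name: 'clock',
  resolve: () => ({ now: () => Date.now() }),
};

const exactMatch = Evaluator.use(clock)
  .define({
    name: 'exact-match',
    inputSchema: Schema.Struct({ expected: Schema.String, actual: Schema.String }),
    outputSchema: Schema.String,
    scoreSchema: Schema.Struct({ passed: Schema.Boolean }),
    // passCriterion receives the raw score and decides pass/fail
    passCriterion: (score) => (score as { passed: boolean }).passed,
  })
  .evaluate((input) => ({ passed: input.expected === input.actual }));
```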

interface MetricItem<TData = unknown> {
    readonly id: string;
    readonly data: TData;
}
interface MetricDef<TData = unknown> {
    readonly id: string;
    readonly name?: string;
    format(data: TData): string;
    make(data: TData): MetricItem<TData>;
}
declare const Metric: {
    of<TData>(config: {
        id: string;
        name?: string;
        format: (data: TData) => string;
    }): MetricDef<TData>;
};
declare function getMetricById(id: string): MetricDef<unknown> | undefined;

type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
interface ScoreItem<TData = unknown> {
    readonly id: string;
    readonly data: TData;
    readonly passed?: boolean;
}
interface ScoreDef<TData = unknown> {
    readonly id: string;
    readonly name?: string;
    readonly displayStrategy: ScoreDisplayStrategy;
    format(data: TData): string;
    make(data: TData, options?: {
        definePassed?: (data: TData) => boolean;
    }): ScoreItem<TData>;
}
declare const Score: {
    of<TData>(config: {
        id: string;
        name?: string;
        displayStrategy: ScoreDisplayStrategy;
        format: (data: TData) => string;
    }): ScoreDef<TData>;
};
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
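
Metric.of and Score.of appear to be small factories for display definitions, with make() wrapping raw data into items. A sketch with hypothetical ids and formatting:

```ts
import { Metric, Score } from '@m4trix/evals';

// Hypothetical metric: formats a simple count for display.
const charCount = Metric.of<{ chars: number }>({
  id: 'char-count',
  name: 'Character count',
  format: (data) => `${data.chars} chars`,
});

// Hypothetical score rendered as a bar; definePassed derives the pass flag.
const accuracy = Score.of<{ value: number }>({
  id: 'accuracy',
  displayStrategy: 'bar',
  format: (data) => `${Math.round(data.value * 100)}%`,
});

const item = accuracy.make({ value: 0.92 }, { definePassed: (d) => d.value >= 0.9 });
const line = `${accuracy.format(item.data)} (${item.passed ? 'pass' : 'fail'})`;
```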

interface CollectedDataset {
    id: string;
    filePath: string;
    dataset: Dataset;
}
interface CollectedEvaluator {
    id: string;
    filePath: string;
    evaluator: Evaluator<unknown, unknown, unknown, unknown>;
}
interface CollectedTestCase {
    id: string;
    filePath: string;
    testCase: TestCase<unknown>;
}
interface SearchTestCasesQuery {
    includedTags?: ReadonlyArray<string | RegExp>;
    excludedTags?: ReadonlyArray<string | RegExp>;
    includedPaths?: ReadonlyArray<string | RegExp>;
    excludedPaths?: ReadonlyArray<string | RegExp>;
}
interface RunDatasetRequest {
    datasetId: string;
    evaluatorIds: ReadonlyArray<string>;
    concurrency?: number;
}
interface RunSnapshot {
    runId: string;
    datasetId: string;
    datasetName: string;
    evaluatorIds: ReadonlyArray<string>;
    queuedAt: number;
    startedAt?: number;
    finishedAt?: number;
    totalTestCases: number;
    completedTestCases: number;
    passedTestCases: number;
    failedTestCases: number;
    status: 'queued' | 'running' | 'completed' | 'failed';
    artifactPath: string;
    errorMessage?: string;
}
type RunnerEvent = {
    type: 'RunQueued';
    runId: string;
    datasetId: string;
    datasetName: string;
    evaluatorIds: ReadonlyArray<string>;
    totalTestCases: number;
    artifactPath: string;
} | {
    type: 'RunStarted';
    runId: string;
    startedAt: number;
} | {
    type: 'TestCaseProgress';
    runId: string;
    testCaseId: string;
    testCaseName: string;
    completedTestCases: number;
    totalTestCases: number;
    passed: boolean;
    durationMs: number;
    evaluatorScores: ReadonlyArray<{
        evaluatorId: string;
        scores: ReadonlyArray<ScoreItem>;
        passed: boolean;
        metrics?: ReadonlyArray<MetricItem>;
    }>;
    errorMessage?: string;
} | {
    type: 'RunCompleted';
    runId: string;
    finishedAt: number;
    passedTestCases: number;
    failedTestCases: number;
    totalTestCases: number;
    artifactPath: string;
} | {
    type: 'RunFailed';
    runId: string;
    finishedAt: number;
    errorMessage: string;
    artifactPath: string;
} | {
    type: 'ArtifactFlushed';
    runId: string;
    artifactPath: string;
};

interface SubscribeOptions {
    runId?: string;
}
interface RunnerApi {
    collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
    collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
    resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
    resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
    searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
    collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
    runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
    subscribeRunEvents(listener: (event: RunnerEvent) => void, options?: SubscribeOptions): () => void;
    getRunSnapshot(runId: string): RunSnapshot | undefined;
    getAllRunSnapshots(): ReadonlyArray<RunSnapshot>;
    shutdown(): Promise<void>;
}
declare function createRunner(overrides?: Partial<RunnerConfig>): RunnerApi;
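
A sketch of driving RunnerApi end to end, assuming an ESM context with top-level await; the artifact directory, logging, and concurrency value are illustrative.

```ts
import { createRunner } from '@m4trix/evals';

// Overrides are a Partial<RunnerConfig>; unspecified fields fall back to defaults.
const runner = createRunner({ artifactDirectory: './eval-artifacts' });

const [datasets, evaluators] = await Promise.all([
  runner.collectDatasets(),
  runner.collectEvaluators(),
]);

// Stream progress events while a run executes.
const unsubscribe = runner.subscribeRunEvents((event) => {
  if (event.type === 'TestCaseProgress') {
    console.log(`${event.completedTestCases}/${event.totalTestCases} ${event.testCaseName}`);
  }
});

if (datasets.length > 0 && evaluators.length > 0) {
  const snapshot = await runner.runDatasetWith({
    datasetId: datasets[0].id,
    evaluatorIds: evaluators.map((e) => e.id),
    concurrency: 4,
  });
  console.log(snapshot.status, snapshot.artifactPath);
}

unsubscribe();
await runner.shutdown();
```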

declare function loadMockData(): EvalsData;
declare function loadRunnerData(runner: RunnerApi): Promise<EvalsData>;
declare function parseStartupArgs(argv: string[]): StartupArgs;

interface TokenCountData {
    input?: number;
    output?: number;
    inputCached?: number;
    outputCached?: number;
}
declare const tokenCountMetric: MetricDef<TokenCountData>;
interface LatencyData {
    ms: number;
}
declare const latencyMetric: MetricDef<LatencyData>;

interface PercentScoreData {
    value: number;
}
declare const percentScore: ScoreDef<PercentScoreData>;
interface BinaryScoreData {
    passed: boolean;
}
declare const binaryScore: ScoreDef<BinaryScoreData>;

export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, Evaluator, type EvaluatorOption, type LatencyData, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };