@m4trix/evals 0.5.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +10 -1
- package/dist/cli-simple.d.ts +0 -0
- package/dist/cli-simple.js +11 -2
- package/dist/cli.cjs +10 -1
- package/dist/cli.d.ts +0 -0
- package/dist/cli.js +10 -1
- package/dist/index.cjs +1 -1
- package/dist/index.d.ts +3 -3
- package/package.json +3 -2
- package/dist/cli-simple.d.cts +0 -1
- package/dist/cli.d.cts +0 -1
- package/dist/index.d.cts +0 -378
package/dist/cli-simple.cjs
CHANGED
|
@@ -633,9 +633,18 @@ function mergeRunnerOverrides(base, next) {
|
|
|
633
633
|
if (!base) {
|
|
634
634
|
return next;
|
|
635
635
|
}
|
|
636
|
-
{
|
|
636
|
+
if (!next) {
|
|
637
637
|
return base;
|
|
638
638
|
}
|
|
639
|
+
const discovery = base.discovery || next.discovery ? {
|
|
640
|
+
...base.discovery ?? {},
|
|
641
|
+
...next.discovery ?? {}
|
|
642
|
+
} : void 0;
|
|
643
|
+
return {
|
|
644
|
+
...base,
|
|
645
|
+
...next,
|
|
646
|
+
discovery
|
|
647
|
+
};
|
|
639
648
|
}
|
|
640
649
|
function createRunner(overrides) {
|
|
641
650
|
const fileOverrides = loadRunnerConfigFile();
|
package/dist/cli-simple.d.ts
CHANGED
|
File without changes
|
package/dist/cli-simple.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
3
|
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
4
4
|
import { existsSync } from 'fs';
|
|
5
|
-
import { resolve,
|
|
5
|
+
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
@@ -610,9 +610,18 @@ function mergeRunnerOverrides(base, next) {
|
|
|
610
610
|
if (!base) {
|
|
611
611
|
return next;
|
|
612
612
|
}
|
|
613
|
-
{
|
|
613
|
+
if (!next) {
|
|
614
614
|
return base;
|
|
615
615
|
}
|
|
616
|
+
const discovery = base.discovery || next.discovery ? {
|
|
617
|
+
...base.discovery ?? {},
|
|
618
|
+
...next.discovery ?? {}
|
|
619
|
+
} : void 0;
|
|
620
|
+
return {
|
|
621
|
+
...base,
|
|
622
|
+
...next,
|
|
623
|
+
discovery
|
|
624
|
+
};
|
|
616
625
|
}
|
|
617
626
|
function createRunner(overrides) {
|
|
618
627
|
const fileOverrides = loadRunnerConfigFile();
|
package/dist/cli.cjs
CHANGED
|
@@ -1899,9 +1899,18 @@ function mergeRunnerOverrides(base, next) {
|
|
|
1899
1899
|
if (!base) {
|
|
1900
1900
|
return next;
|
|
1901
1901
|
}
|
|
1902
|
-
{
|
|
1902
|
+
if (!next) {
|
|
1903
1903
|
return base;
|
|
1904
1904
|
}
|
|
1905
|
+
const discovery = base.discovery || next.discovery ? {
|
|
1906
|
+
...base.discovery ?? {},
|
|
1907
|
+
...next.discovery ?? {}
|
|
1908
|
+
} : void 0;
|
|
1909
|
+
return {
|
|
1910
|
+
...base,
|
|
1911
|
+
...next,
|
|
1912
|
+
discovery
|
|
1913
|
+
};
|
|
1905
1914
|
}
|
|
1906
1915
|
function createRunner(overrides) {
|
|
1907
1916
|
const fileOverrides = loadRunnerConfigFile();
|
package/dist/cli.d.ts
CHANGED
|
File without changes
|
package/dist/cli.js
CHANGED
|
@@ -1873,9 +1873,18 @@ function mergeRunnerOverrides(base, next) {
|
|
|
1873
1873
|
if (!base) {
|
|
1874
1874
|
return next;
|
|
1875
1875
|
}
|
|
1876
|
-
{
|
|
1876
|
+
if (!next) {
|
|
1877
1877
|
return base;
|
|
1878
1878
|
}
|
|
1879
|
+
const discovery = base.discovery || next.discovery ? {
|
|
1880
|
+
...base.discovery ?? {},
|
|
1881
|
+
...next.discovery ?? {}
|
|
1882
|
+
} : void 0;
|
|
1883
|
+
return {
|
|
1884
|
+
...base,
|
|
1885
|
+
...next,
|
|
1886
|
+
discovery
|
|
1887
|
+
};
|
|
1879
1888
|
}
|
|
1880
1889
|
function createRunner(overrides) {
|
|
1881
1890
|
const fileOverrides = loadRunnerConfigFile();
|
package/dist/index.cjs
CHANGED
package/dist/index.d.ts
CHANGED
|
@@ -210,7 +210,7 @@ interface MetricDef<TData = unknown> {
|
|
|
210
210
|
declare const Metric: {
|
|
211
211
|
of<TData>(config: {
|
|
212
212
|
id: string;
|
|
213
|
-
name?: string;
|
|
213
|
+
name?: string | undefined;
|
|
214
214
|
format: (data: TData) => string;
|
|
215
215
|
}): MetricDef<TData>;
|
|
216
216
|
};
|
|
@@ -234,7 +234,7 @@ interface ScoreDef<TData = unknown> {
|
|
|
234
234
|
declare const Score: {
|
|
235
235
|
of<TData>(config: {
|
|
236
236
|
id: string;
|
|
237
|
-
name?: string;
|
|
237
|
+
name?: string | undefined;
|
|
238
238
|
displayStrategy: ScoreDisplayStrategy;
|
|
239
239
|
format: (data: TData) => string;
|
|
240
240
|
}): ScoreDef<TData>;
|
|
@@ -375,4 +375,4 @@ interface BinaryScoreData {
|
|
|
375
375
|
}
|
|
376
376
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
377
377
|
|
|
378
|
-
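export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };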
|
|
378
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@m4trix/evals",
|
|
3
|
-
"version": "0.5.0",
|
|
3
|
+
"version": "0.8.0",
|
|
4
4
|
"description": "Ink-based evaluation console for m4trix",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -27,7 +27,8 @@
|
|
|
27
27
|
"dev": "tsup --watch",
|
|
28
28
|
"lint": "eslint src --ext .ts,.tsx",
|
|
29
29
|
"prepare": "pnpm run build",
|
|
30
|
-
"publish-package": "npm publish --no-git-checks"
|
|
30
|
+
"publish-package": "npm publish --no-git-checks",
|
|
31
|
+
"release": "pnpm -C ../.. run release:bump:evals"
|
|
31
32
|
},
|
|
32
33
|
"dependencies": {
|
|
33
34
|
"effect": "^3.16.10",
|
package/dist/cli-simple.d.cts
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
package/dist/cli.d.cts
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
package/dist/index.d.cts
DELETED
|
@@ -1,378 +0,0 @@
|
|
|
1
|
-
import { Schema } from 'effect';
|
|
2
|
-
export { Schema as S } from 'effect';
|
|
3
|
-
|
|
4
|
-
type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
|
|
5
|
-
interface EvalDimension {
|
|
6
|
-
name: string;
|
|
7
|
-
score: number;
|
|
8
|
-
}
|
|
9
|
-
interface EvalCheck {
|
|
10
|
-
name: string;
|
|
11
|
-
passed: boolean;
|
|
12
|
-
detail?: string;
|
|
13
|
-
}
|
|
14
|
-
interface EvalFailure {
|
|
15
|
-
title: string;
|
|
16
|
-
}
|
|
17
|
-
interface EvalPerformance {
|
|
18
|
-
passRate: number;
|
|
19
|
-
avgScore: number;
|
|
20
|
-
latencyP95Ms: number;
|
|
21
|
-
latencyAvgMs: number;
|
|
22
|
-
tokensAvg: number;
|
|
23
|
-
tokensP95: number;
|
|
24
|
-
costUsd: number;
|
|
25
|
-
/** Per-sample latency in ms for sparkline (e.g. last N requests) */
|
|
26
|
-
latencyHistoryMs?: number[];
|
|
27
|
-
}
|
|
28
|
-
interface EvalRunMeta {
|
|
29
|
-
model: string;
|
|
30
|
-
provider: string;
|
|
31
|
-
commit: string;
|
|
32
|
-
branch: string;
|
|
33
|
-
seed: number;
|
|
34
|
-
concurrency: number;
|
|
35
|
-
duration: string;
|
|
36
|
-
artifact: string;
|
|
37
|
-
}
|
|
38
|
-
interface EvalRun {
|
|
39
|
-
id: string;
|
|
40
|
-
label: string;
|
|
41
|
-
status: EvalStatus;
|
|
42
|
-
performance: EvalPerformance;
|
|
43
|
-
dimensions: EvalDimension[];
|
|
44
|
-
checks: EvalCheck[];
|
|
45
|
-
failures: EvalFailure[];
|
|
46
|
-
meta: EvalRunMeta;
|
|
47
|
-
}
|
|
48
|
-
interface EvalDataset {
|
|
49
|
-
id: string;
|
|
50
|
-
name: string;
|
|
51
|
-
overview: string;
|
|
52
|
-
runs: EvalRun[];
|
|
53
|
-
}
|
|
54
|
-
interface EvaluatorOption {
|
|
55
|
-
id: string;
|
|
56
|
-
name: string;
|
|
57
|
-
configPreview: string;
|
|
58
|
-
}
|
|
59
|
-
interface EvalsData {
|
|
60
|
-
datasets: EvalDataset[];
|
|
61
|
-
evaluators: EvaluatorOption[];
|
|
62
|
-
}
|
|
63
|
-
type PaneFocus = 'left' | 'right';
|
|
64
|
-
type ViewLevel = 'datasets' | 'runs' | 'details' | 'new-evaluation';
|
|
65
|
-
interface StartupArgs {
|
|
66
|
-
datasetId?: string;
|
|
67
|
-
runId?: string;
|
|
68
|
-
search?: string;
|
|
69
|
-
unknownArgs: string[];
|
|
70
|
-
}
|
|
71
|
-
interface CliState {
|
|
72
|
-
level: ViewLevel;
|
|
73
|
-
focus: PaneFocus;
|
|
74
|
-
datasetMenuIndex: number;
|
|
75
|
-
runMenuIndex: number;
|
|
76
|
-
detailsScrollOffset: number;
|
|
77
|
-
selectedEvaluatorIds: string[];
|
|
78
|
-
evaluatorMenuIndex: number;
|
|
79
|
-
searchQuery: string;
|
|
80
|
-
searchMode: boolean;
|
|
81
|
-
startupWarnings: string[];
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
interface RunnerDiscoveryConfig {
|
|
85
|
-
rootDir: string;
|
|
86
|
-
datasetSuffixes: ReadonlyArray<string>;
|
|
87
|
-
evaluatorSuffixes: ReadonlyArray<string>;
|
|
88
|
-
testCaseSuffixes: ReadonlyArray<string>;
|
|
89
|
-
excludeDirectories: ReadonlyArray<string>;
|
|
90
|
-
}
|
|
91
|
-
interface RunnerConfig {
|
|
92
|
-
discovery: RunnerDiscoveryConfig;
|
|
93
|
-
artifactDirectory: string;
|
|
94
|
-
}
|
|
95
|
-
type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
|
|
96
|
-
discovery?: Partial<RunnerDiscoveryConfig>;
|
|
97
|
-
};
|
|
98
|
-
interface M4trixEvalConfigDiscovery {
|
|
99
|
-
rootDir?: string;
|
|
100
|
-
datasetFilePatterns?: ReadonlyArray<string>;
|
|
101
|
-
evaluatorFilePatterns?: ReadonlyArray<string>;
|
|
102
|
-
testCaseFilePatterns?: ReadonlyArray<string>;
|
|
103
|
-
datasetSuffixes?: ReadonlyArray<string>;
|
|
104
|
-
evaluatorSuffixes?: ReadonlyArray<string>;
|
|
105
|
-
testCaseSuffixes?: ReadonlyArray<string>;
|
|
106
|
-
excludeDirectories?: ReadonlyArray<string>;
|
|
107
|
-
}
|
|
108
|
-
interface M4trixEvalConfig {
|
|
109
|
-
discovery?: M4trixEvalConfigDiscovery;
|
|
110
|
-
artifactDirectory?: string;
|
|
111
|
-
}
|
|
112
|
-
type ConfigType = M4trixEvalConfig;
|
|
113
|
-
type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
|
|
114
|
-
declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
|
|
115
|
-
declare const defaultRunnerConfig: RunnerConfig;
|
|
116
|
-
declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
|
|
117
|
-
|
|
118
|
-
/** Matches a tag by exact string equality or regex test */
|
|
119
|
-
type TagMatcher = string | RegExp;
|
|
120
|
-
/** Matches a file path by glob string or regex test */
|
|
121
|
-
type PathMatcher = string | RegExp;
|
|
122
|
-
|
|
123
|
-
type InputOrBuilder<T> = T | (() => T);
|
|
124
|
-
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
125
|
-
name: string;
|
|
126
|
-
tags: string[];
|
|
127
|
-
inputSchema: TI;
|
|
128
|
-
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
129
|
-
outputSchema?: TO;
|
|
130
|
-
output?: InputOrBuilder<Schema.Schema.Type<TO>>;
|
|
131
|
-
}
|
|
132
|
-
declare class TestCase<TInput = unknown, TOutput = unknown> {
|
|
133
|
-
private readonly _config;
|
|
134
|
-
private constructor();
|
|
135
|
-
static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
|
|
136
|
-
getName(): string;
|
|
137
|
-
getTags(): string[];
|
|
138
|
-
getInputSchema(): Schema.Schema.Any;
|
|
139
|
-
getInput(): TInput;
|
|
140
|
-
getOutputSchema(): Schema.Schema.Any | undefined;
|
|
141
|
-
getOutput(): TOutput | undefined;
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
interface DatasetDefineConfig {
|
|
145
|
-
name: string;
|
|
146
|
-
includedTags?: TagMatcher[];
|
|
147
|
-
excludedTags?: TagMatcher[];
|
|
148
|
-
includedPaths?: PathMatcher[];
|
|
149
|
-
excludedPaths?: PathMatcher[];
|
|
150
|
-
}
|
|
151
|
-
declare class Dataset {
|
|
152
|
-
private readonly _config;
|
|
153
|
-
private constructor();
|
|
154
|
-
static define(config: DatasetDefineConfig): Dataset;
|
|
155
|
-
getName(): string;
|
|
156
|
-
getIncludedTags(): ReadonlyArray<TagMatcher>;
|
|
157
|
-
getExcludedTags(): ReadonlyArray<TagMatcher>;
|
|
158
|
-
getIncludedPaths(): ReadonlyArray<PathMatcher>;
|
|
159
|
-
getExcludedPaths(): ReadonlyArray<PathMatcher>;
|
|
160
|
-
matchesTestCase(testCase: TestCase<unknown>, filePath: string): boolean;
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
interface EvalMiddleware<TCtx> {
|
|
164
|
-
name: string;
|
|
165
|
-
resolve: () => TCtx | Promise<TCtx>;
|
|
166
|
-
}
|
|
167
|
-
interface EvaluateArgs<TInput, TCtx> {
|
|
168
|
-
input: TInput;
|
|
169
|
-
ctx: TCtx;
|
|
170
|
-
output?: unknown;
|
|
171
|
-
}
|
|
172
|
-
type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
|
|
173
|
-
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
174
|
-
name: string;
|
|
175
|
-
inputSchema: TI;
|
|
176
|
-
outputSchema: TO;
|
|
177
|
-
scoreSchema: TS;
|
|
178
|
-
passThreshold?: number;
|
|
179
|
-
passCriterion?: (score: unknown) => boolean;
|
|
180
|
-
}
|
|
181
|
-
declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, TCtx = Record<string, never>> {
|
|
182
|
-
private readonly _config;
|
|
183
|
-
private constructor();
|
|
184
|
-
private getState;
|
|
185
|
-
static use<TCtx>(middleware: EvalMiddleware<TCtx>): Evaluator<unknown, unknown, unknown, TCtx>;
|
|
186
|
-
use<TNew>(middleware: EvalMiddleware<TNew>): Evaluator<TInput, TOutput, TScore, TCtx & TNew>;
|
|
187
|
-
define<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any>(config: EvaluatorDefineConfig<TI, TO, TS>): Evaluator<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>, Schema.Schema.Type<TS>, TCtx>;
|
|
188
|
-
evaluate(fn: EvaluateFn<TInput, TScore, TCtx>): Evaluator<TInput, TOutput, TScore, TCtx>;
|
|
189
|
-
getName(): string | undefined;
|
|
190
|
-
getInputSchema(): Schema.Schema.Any | undefined;
|
|
191
|
-
getOutputSchema(): Schema.Schema.Any | undefined;
|
|
192
|
-
getScoreSchema(): Schema.Schema.Any | undefined;
|
|
193
|
-
getMiddlewares(): ReadonlyArray<EvalMiddleware<unknown>>;
|
|
194
|
-
getEvaluateFn(): EvaluateFn<TInput, TScore, TCtx> | undefined;
|
|
195
|
-
getPassThreshold(): number | undefined;
|
|
196
|
-
getPassCriterion(): ((score: unknown) => boolean) | undefined;
|
|
197
|
-
resolveContext(): Promise<TCtx>;
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
interface MetricItem<TData = unknown> {
|
|
201
|
-
readonly id: string;
|
|
202
|
-
readonly data: TData;
|
|
203
|
-
}
|
|
204
|
-
interface MetricDef<TData = unknown> {
|
|
205
|
-
readonly id: string;
|
|
206
|
-
readonly name?: string;
|
|
207
|
-
format(data: TData): string;
|
|
208
|
-
make(data: TData): MetricItem<TData>;
|
|
209
|
-
}
|
|
210
|
-
declare const Metric: {
|
|
211
|
-
of<TData>(config: {
|
|
212
|
-
id: string;
|
|
213
|
-
name?: string;
|
|
214
|
-
format: (data: TData) => string;
|
|
215
|
-
}): MetricDef<TData>;
|
|
216
|
-
};
|
|
217
|
-
declare function getMetricById(id: string): MetricDef<unknown> | undefined;
|
|
218
|
-
|
|
219
|
-
type ScoreDisplayStrategy = 'bar' | 'number' | 'passFail';
|
|
220
|
-
interface ScoreItem<TData = unknown> {
|
|
221
|
-
readonly id: string;
|
|
222
|
-
readonly data: TData;
|
|
223
|
-
readonly passed?: boolean;
|
|
224
|
-
}
|
|
225
|
-
interface ScoreDef<TData = unknown> {
|
|
226
|
-
readonly id: string;
|
|
227
|
-
readonly name?: string;
|
|
228
|
-
readonly displayStrategy: ScoreDisplayStrategy;
|
|
229
|
-
format(data: TData): string;
|
|
230
|
-
make(data: TData, options?: {
|
|
231
|
-
definePassed?: (data: TData) => boolean;
|
|
232
|
-
}): ScoreItem<TData>;
|
|
233
|
-
}
|
|
234
|
-
declare const Score: {
|
|
235
|
-
of<TData>(config: {
|
|
236
|
-
id: string;
|
|
237
|
-
name?: string;
|
|
238
|
-
displayStrategy: ScoreDisplayStrategy;
|
|
239
|
-
format: (data: TData) => string;
|
|
240
|
-
}): ScoreDef<TData>;
|
|
241
|
-
};
|
|
242
|
-
declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
|
|
243
|
-
|
|
244
|
-
interface CollectedDataset {
|
|
245
|
-
id: string;
|
|
246
|
-
filePath: string;
|
|
247
|
-
dataset: Dataset;
|
|
248
|
-
}
|
|
249
|
-
interface CollectedEvaluator {
|
|
250
|
-
id: string;
|
|
251
|
-
filePath: string;
|
|
252
|
-
evaluator: Evaluator<unknown, unknown, unknown, unknown>;
|
|
253
|
-
}
|
|
254
|
-
interface CollectedTestCase {
|
|
255
|
-
id: string;
|
|
256
|
-
filePath: string;
|
|
257
|
-
testCase: TestCase<unknown, unknown>;
|
|
258
|
-
}
|
|
259
|
-
interface SearchTestCasesQuery {
|
|
260
|
-
includedTags?: ReadonlyArray<string | RegExp>;
|
|
261
|
-
excludedTags?: ReadonlyArray<string | RegExp>;
|
|
262
|
-
includedPaths?: ReadonlyArray<string | RegExp>;
|
|
263
|
-
excludedPaths?: ReadonlyArray<string | RegExp>;
|
|
264
|
-
}
|
|
265
|
-
interface RunDatasetRequest {
|
|
266
|
-
datasetId: string;
|
|
267
|
-
evaluatorIds: ReadonlyArray<string>;
|
|
268
|
-
concurrency?: number;
|
|
269
|
-
}
|
|
270
|
-
interface RunSnapshot {
|
|
271
|
-
runId: string;
|
|
272
|
-
datasetId: string;
|
|
273
|
-
datasetName: string;
|
|
274
|
-
evaluatorIds: ReadonlyArray<string>;
|
|
275
|
-
queuedAt: number;
|
|
276
|
-
startedAt?: number;
|
|
277
|
-
finishedAt?: number;
|
|
278
|
-
totalTestCases: number;
|
|
279
|
-
completedTestCases: number;
|
|
280
|
-
passedTestCases: number;
|
|
281
|
-
failedTestCases: number;
|
|
282
|
-
status: 'queued' | 'running' | 'completed' | 'failed';
|
|
283
|
-
artifactPath: string;
|
|
284
|
-
errorMessage?: string;
|
|
285
|
-
}
|
|
286
|
-
type RunnerEvent = {
|
|
287
|
-
type: 'RunQueued';
|
|
288
|
-
runId: string;
|
|
289
|
-
datasetId: string;
|
|
290
|
-
datasetName: string;
|
|
291
|
-
evaluatorIds: ReadonlyArray<string>;
|
|
292
|
-
totalTestCases: number;
|
|
293
|
-
artifactPath: string;
|
|
294
|
-
} | {
|
|
295
|
-
type: 'RunStarted';
|
|
296
|
-
runId: string;
|
|
297
|
-
startedAt: number;
|
|
298
|
-
} | {
|
|
299
|
-
type: 'TestCaseProgress';
|
|
300
|
-
runId: string;
|
|
301
|
-
testCaseId: string;
|
|
302
|
-
testCaseName: string;
|
|
303
|
-
completedTestCases: number;
|
|
304
|
-
totalTestCases: number;
|
|
305
|
-
passed: boolean;
|
|
306
|
-
durationMs: number;
|
|
307
|
-
evaluatorScores: ReadonlyArray<{
|
|
308
|
-
evaluatorId: string;
|
|
309
|
-
scores: ReadonlyArray<ScoreItem>;
|
|
310
|
-
passed: boolean;
|
|
311
|
-
metrics?: ReadonlyArray<MetricItem>;
|
|
312
|
-
}>;
|
|
313
|
-
output?: unknown;
|
|
314
|
-
errorMessage?: string;
|
|
315
|
-
} | {
|
|
316
|
-
type: 'RunCompleted';
|
|
317
|
-
runId: string;
|
|
318
|
-
finishedAt: number;
|
|
319
|
-
passedTestCases: number;
|
|
320
|
-
failedTestCases: number;
|
|
321
|
-
totalTestCases: number;
|
|
322
|
-
artifactPath: string;
|
|
323
|
-
} | {
|
|
324
|
-
type: 'RunFailed';
|
|
325
|
-
runId: string;
|
|
326
|
-
finishedAt: number;
|
|
327
|
-
errorMessage: string;
|
|
328
|
-
artifactPath: string;
|
|
329
|
-
} | {
|
|
330
|
-
type: 'ArtifactFlushed';
|
|
331
|
-
runId: string;
|
|
332
|
-
artifactPath: string;
|
|
333
|
-
};
|
|
334
|
-
|
|
335
|
-
interface SubscribeOptions {
|
|
336
|
-
runId?: string;
|
|
337
|
-
}
|
|
338
|
-
interface RunnerApi {
|
|
339
|
-
collectDatasets(): Promise<ReadonlyArray<CollectedDataset>>;
|
|
340
|
-
collectEvaluators(): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
341
|
-
resolveDatasetByName(name: string): Promise<CollectedDataset | undefined>;
|
|
342
|
-
resolveEvaluatorsByNamePattern(pattern: string): Promise<ReadonlyArray<CollectedEvaluator>>;
|
|
343
|
-
searchTestCases(query?: SearchTestCasesQuery): Promise<ReadonlyArray<CollectedTestCase>>;
|
|
344
|
-
collectDatasetTestCases(datasetId: string): Promise<ReadonlyArray<CollectedTestCase>>;
|
|
345
|
-
runDatasetWith(request: RunDatasetRequest): Promise<RunSnapshot>;
|
|
346
|
-
subscribeRunEvents(listener: (event: RunnerEvent) => void, options?: SubscribeOptions): () => void;
|
|
347
|
-
getRunSnapshot(runId: string): RunSnapshot | undefined;
|
|
348
|
-
getAllRunSnapshots(): ReadonlyArray<RunSnapshot>;
|
|
349
|
-
shutdown(): Promise<void>;
|
|
350
|
-
}
|
|
351
|
-
declare function createRunner(overrides?: RunnerConfigOverrides): RunnerApi;
|
|
352
|
-
|
|
353
|
-
declare function loadMockData(): EvalsData;
|
|
354
|
-
declare function loadRunnerData(runner: RunnerApi): Promise<EvalsData>;
|
|
355
|
-
declare function parseStartupArgs(argv: string[]): StartupArgs;
|
|
356
|
-
|
|
357
|
-
interface TokenCountData {
|
|
358
|
-
input?: number;
|
|
359
|
-
output?: number;
|
|
360
|
-
inputCached?: number;
|
|
361
|
-
outputCached?: number;
|
|
362
|
-
}
|
|
363
|
-
declare const tokenCountMetric: MetricDef<TokenCountData>;
|
|
364
|
-
interface LatencyData {
|
|
365
|
-
ms: number;
|
|
366
|
-
}
|
|
367
|
-
declare const latencyMetric: MetricDef<LatencyData>;
|
|
368
|
-
|
|
369
|
-
interface PercentScoreData {
|
|
370
|
-
value: number;
|
|
371
|
-
}
|
|
372
|
-
declare const percentScore: ScoreDef<PercentScoreData>;
|
|
373
|
-
interface BinaryScoreData {
|
|
374
|
-
passed: boolean;
|
|
375
|
-
}
|
|
376
|
-
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
377
|
-
|
|
378
|
-
export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
|