@m4trix/evals 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -9
- package/dist/cli-simple.cjs +28 -4
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +28 -4
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +20 -3
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +20 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +37 -7
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -7
- package/dist/index.d.ts +17 -7
- package/dist/index.js +34 -7
- package/dist/index.js.map +1 -1
- package/package.json +2 -4
package/dist/index.d.cts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { Schema } from 'effect';
|
|
2
|
+
export { Schema as S } from 'effect';
|
|
2
3
|
|
|
3
4
|
type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
|
|
4
5
|
interface EvalDimension {
|
|
@@ -110,7 +111,7 @@ interface M4trixEvalConfig {
|
|
|
110
111
|
}
|
|
111
112
|
type ConfigType = M4trixEvalConfig;
|
|
112
113
|
type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
|
|
113
|
-
declare function
|
|
114
|
+
declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
|
|
114
115
|
declare const defaultRunnerConfig: RunnerConfig;
|
|
115
116
|
declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
|
|
116
117
|
|
|
@@ -120,20 +121,23 @@ type TagMatcher = string | RegExp;
|
|
|
120
121
|
type PathMatcher = string | RegExp;
|
|
121
122
|
|
|
122
123
|
type InputOrBuilder<T> = T | (() => T);
|
|
123
|
-
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any> {
|
|
124
|
+
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TOutputDefinition = unknown> {
|
|
124
125
|
name: string;
|
|
125
126
|
tags: string[];
|
|
126
127
|
inputSchema: TI;
|
|
127
128
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
129
|
+
outputDefinition?: InputOrBuilder<TOutputDefinition>;
|
|
130
|
+
outputDefintion?: InputOrBuilder<TOutputDefinition>;
|
|
128
131
|
}
|
|
129
|
-
declare class TestCase<TInput = unknown> {
|
|
132
|
+
declare class TestCase<TInput = unknown, TOutputDefinition = unknown> {
|
|
130
133
|
private readonly _config;
|
|
131
134
|
private constructor();
|
|
132
|
-
static describe<TI extends Schema.Schema.Any>(config: TestCaseDescribeConfig<TI>): TestCase<Schema.Schema.Type<TI>>;
|
|
135
|
+
static describe<TI extends Schema.Schema.Any, TOutputDefinition = unknown>(config: TestCaseDescribeConfig<TI, TOutputDefinition>): TestCase<Schema.Schema.Type<TI>, TOutputDefinition>;
|
|
133
136
|
getName(): string;
|
|
134
137
|
getTags(): string[];
|
|
135
138
|
getInputSchema(): Schema.Schema.Any;
|
|
136
139
|
getInput(): TInput;
|
|
140
|
+
getOutputDefinition(): TOutputDefinition | undefined;
|
|
137
141
|
}
|
|
138
142
|
|
|
139
143
|
interface DatasetDefineConfig {
|
|
@@ -159,7 +163,12 @@ interface EvalMiddleware<TCtx> {
|
|
|
159
163
|
name: string;
|
|
160
164
|
resolve: () => TCtx | Promise<TCtx>;
|
|
161
165
|
}
|
|
162
|
-
|
|
166
|
+
interface EvaluateArgs<TInput, TCtx> {
|
|
167
|
+
input: TInput;
|
|
168
|
+
ctx: TCtx;
|
|
169
|
+
output?: unknown;
|
|
170
|
+
}
|
|
171
|
+
type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
|
|
163
172
|
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
164
173
|
name: string;
|
|
165
174
|
inputSchema: TI;
|
|
@@ -244,7 +253,7 @@ interface CollectedEvaluator {
|
|
|
244
253
|
interface CollectedTestCase {
|
|
245
254
|
id: string;
|
|
246
255
|
filePath: string;
|
|
247
|
-
testCase: TestCase<unknown>;
|
|
256
|
+
testCase: TestCase<unknown, unknown>;
|
|
248
257
|
}
|
|
249
258
|
interface SearchTestCasesQuery {
|
|
250
259
|
includedTags?: ReadonlyArray<string | RegExp>;
|
|
@@ -300,6 +309,7 @@ type RunnerEvent = {
|
|
|
300
309
|
passed: boolean;
|
|
301
310
|
metrics?: ReadonlyArray<MetricItem>;
|
|
302
311
|
}>;
|
|
312
|
+
outputDefinition?: unknown;
|
|
303
313
|
errorMessage?: string;
|
|
304
314
|
} | {
|
|
305
315
|
type: 'RunCompleted';
|
|
@@ -364,4 +374,4 @@ interface BinaryScoreData {
|
|
|
364
374
|
}
|
|
365
375
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
366
376
|
|
|
367
|
-
export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig,
|
|
377
|
+
export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { Schema } from 'effect';
|
|
2
|
+
export { Schema as S } from 'effect';
|
|
2
3
|
|
|
3
4
|
type EvalStatus = 'PASS' | 'FAILED' | 'RUNNING';
|
|
4
5
|
interface EvalDimension {
|
|
@@ -110,7 +111,7 @@ interface M4trixEvalConfig {
|
|
|
110
111
|
}
|
|
111
112
|
type ConfigType = M4trixEvalConfig;
|
|
112
113
|
type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
|
|
113
|
-
declare function
|
|
114
|
+
declare function defineConfig<TConfig extends ConfigType>(factory: M4trixEvalConfigFactory<TConfig>): M4trixEvalConfigFactory<TConfig>;
|
|
114
115
|
declare const defaultRunnerConfig: RunnerConfig;
|
|
115
116
|
declare function withRunnerConfig(overrides?: RunnerConfigOverrides): RunnerConfig;
|
|
116
117
|
|
|
@@ -120,20 +121,23 @@ type TagMatcher = string | RegExp;
|
|
|
120
121
|
type PathMatcher = string | RegExp;
|
|
121
122
|
|
|
122
123
|
type InputOrBuilder<T> = T | (() => T);
|
|
123
|
-
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any> {
|
|
124
|
+
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TOutputDefinition = unknown> {
|
|
124
125
|
name: string;
|
|
125
126
|
tags: string[];
|
|
126
127
|
inputSchema: TI;
|
|
127
128
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
129
|
+
outputDefinition?: InputOrBuilder<TOutputDefinition>;
|
|
130
|
+
outputDefintion?: InputOrBuilder<TOutputDefinition>;
|
|
128
131
|
}
|
|
129
|
-
declare class TestCase<TInput = unknown> {
|
|
132
|
+
declare class TestCase<TInput = unknown, TOutputDefinition = unknown> {
|
|
130
133
|
private readonly _config;
|
|
131
134
|
private constructor();
|
|
132
|
-
static describe<TI extends Schema.Schema.Any>(config: TestCaseDescribeConfig<TI>): TestCase<Schema.Schema.Type<TI>>;
|
|
135
|
+
static describe<TI extends Schema.Schema.Any, TOutputDefinition = unknown>(config: TestCaseDescribeConfig<TI, TOutputDefinition>): TestCase<Schema.Schema.Type<TI>, TOutputDefinition>;
|
|
133
136
|
getName(): string;
|
|
134
137
|
getTags(): string[];
|
|
135
138
|
getInputSchema(): Schema.Schema.Any;
|
|
136
139
|
getInput(): TInput;
|
|
140
|
+
getOutputDefinition(): TOutputDefinition | undefined;
|
|
137
141
|
}
|
|
138
142
|
|
|
139
143
|
interface DatasetDefineConfig {
|
|
@@ -159,7 +163,12 @@ interface EvalMiddleware<TCtx> {
|
|
|
159
163
|
name: string;
|
|
160
164
|
resolve: () => TCtx | Promise<TCtx>;
|
|
161
165
|
}
|
|
162
|
-
|
|
166
|
+
interface EvaluateArgs<TInput, TCtx> {
|
|
167
|
+
input: TInput;
|
|
168
|
+
ctx: TCtx;
|
|
169
|
+
output?: unknown;
|
|
170
|
+
}
|
|
171
|
+
type EvaluateFn<TInput, TScore, TCtx> = (args: EvaluateArgs<TInput, TCtx>) => TScore | Promise<TScore>;
|
|
163
172
|
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
164
173
|
name: string;
|
|
165
174
|
inputSchema: TI;
|
|
@@ -244,7 +253,7 @@ interface CollectedEvaluator {
|
|
|
244
253
|
interface CollectedTestCase {
|
|
245
254
|
id: string;
|
|
246
255
|
filePath: string;
|
|
247
|
-
testCase: TestCase<unknown>;
|
|
256
|
+
testCase: TestCase<unknown, unknown>;
|
|
248
257
|
}
|
|
249
258
|
interface SearchTestCasesQuery {
|
|
250
259
|
includedTags?: ReadonlyArray<string | RegExp>;
|
|
@@ -300,6 +309,7 @@ type RunnerEvent = {
|
|
|
300
309
|
passed: boolean;
|
|
301
310
|
metrics?: ReadonlyArray<MetricItem>;
|
|
302
311
|
}>;
|
|
312
|
+
outputDefinition?: unknown;
|
|
303
313
|
errorMessage?: string;
|
|
304
314
|
} | {
|
|
305
315
|
type: 'RunCompleted';
|
|
@@ -364,4 +374,4 @@ interface BinaryScoreData {
|
|
|
364
374
|
}
|
|
365
375
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
366
376
|
|
|
367
|
-
export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig,
|
|
377
|
+
export { type BinaryScoreData, type CliState, type CollectedDataset, type CollectedEvaluator, type CollectedTestCase, type ConfigType, Dataset, type EvalDataset, type EvalMiddleware, type EvalRun, type EvalsData, type EvaluateArgs, Evaluator, type EvaluatorOption, type LatencyData, type M4trixEvalConfig, type M4trixEvalConfigDiscovery, Metric, type MetricDef, type MetricItem, type PathMatcher, type PercentScoreData, type RunDatasetRequest, type RunSnapshot, type RunnerApi, type RunnerConfig, type RunnerConfigOverrides, type RunnerDiscoveryConfig, type RunnerEvent, Score, type ScoreDef, type ScoreDisplayStrategy, type ScoreItem, type SearchTestCasesQuery, type StartupArgs, type TagMatcher, TestCase, type TokenCountData, type ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { randomUUID } from 'crypto';
|
|
2
1
|
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
2
|
+
export { Schema as S } from 'effect';
|
|
3
|
+
import { randomUUID } from 'crypto';
|
|
3
4
|
import { existsSync } from 'fs';
|
|
4
5
|
import { resolve as resolve$1, relative, join, dirname } from 'path';
|
|
5
6
|
import * as jitiModule from 'jiti';
|
|
@@ -305,7 +306,9 @@ var TestCase = class _TestCase {
|
|
|
305
306
|
name: config.name,
|
|
306
307
|
tags: config.tags,
|
|
307
308
|
inputSchema: config.inputSchema,
|
|
308
|
-
input: config.input
|
|
309
|
+
input: config.input,
|
|
310
|
+
outputDefinition: config.outputDefinition,
|
|
311
|
+
outputDefintion: config.outputDefintion
|
|
309
312
|
});
|
|
310
313
|
}
|
|
311
314
|
getName() {
|
|
@@ -320,6 +323,13 @@ var TestCase = class _TestCase {
|
|
|
320
323
|
getInput() {
|
|
321
324
|
return resolve(this._config.input);
|
|
322
325
|
}
|
|
326
|
+
getOutputDefinition() {
|
|
327
|
+
const value = this._config.outputDefinition ?? this._config.outputDefintion;
|
|
328
|
+
if (value === void 0) {
|
|
329
|
+
return void 0;
|
|
330
|
+
}
|
|
331
|
+
return resolve(value);
|
|
332
|
+
}
|
|
323
333
|
};
|
|
324
334
|
|
|
325
335
|
// src/evals/evaluator.ts
|
|
@@ -543,7 +553,7 @@ var binaryScore = Score.of({
|
|
|
543
553
|
});
|
|
544
554
|
|
|
545
555
|
// src/runner/config.ts
|
|
546
|
-
function
|
|
556
|
+
function defineConfig(factory) {
|
|
547
557
|
return factory;
|
|
548
558
|
}
|
|
549
559
|
var defaultRunnerConfig = {
|
|
@@ -624,7 +634,9 @@ function getJitiLoader() {
|
|
|
624
634
|
}
|
|
625
635
|
const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
|
|
626
636
|
if (typeof createJiti2 !== "function") {
|
|
627
|
-
throw new Error(
|
|
637
|
+
throw new Error(
|
|
638
|
+
"Failed to initialize jiti for m4trix eval config loading."
|
|
639
|
+
);
|
|
628
640
|
}
|
|
629
641
|
cachedLoader = createJiti2(import.meta.url, {
|
|
630
642
|
interopDefault: true,
|
|
@@ -647,7 +659,7 @@ function resolveConfigValue(value) {
|
|
|
647
659
|
}
|
|
648
660
|
if (typeof value !== "object") {
|
|
649
661
|
throw new Error(
|
|
650
|
-
"Invalid m4trix eval config export. Expected an object or
|
|
662
|
+
"Invalid m4trix eval config export. Expected an object or defineConfig(() => config)."
|
|
651
663
|
);
|
|
652
664
|
}
|
|
653
665
|
return value;
|
|
@@ -852,6 +864,13 @@ function normalizeResult(result) {
|
|
|
852
864
|
const metrics = Array.isArray(obj.metrics) ? obj.metrics : void 0;
|
|
853
865
|
return { scores, metrics };
|
|
854
866
|
}
|
|
867
|
+
function readOutputDefinition(testCase) {
|
|
868
|
+
const candidate = testCase;
|
|
869
|
+
if (typeof candidate.getOutputDefinition !== "function") {
|
|
870
|
+
return void 0;
|
|
871
|
+
}
|
|
872
|
+
return candidate.getOutputDefinition();
|
|
873
|
+
}
|
|
855
874
|
function nowIsoForFile() {
|
|
856
875
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
857
876
|
}
|
|
@@ -880,6 +899,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
880
899
|
const started = Date.now();
|
|
881
900
|
const evaluatorScores = [];
|
|
882
901
|
let testCaseError;
|
|
902
|
+
const outputDefinition = readOutputDefinition(testCaseItem.testCase);
|
|
883
903
|
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
884
904
|
const evaluateFn = evaluator.getEvaluateFn();
|
|
885
905
|
if (!evaluateFn) {
|
|
@@ -890,7 +910,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
890
910
|
() => Promise.resolve(evaluator.resolveContext())
|
|
891
911
|
);
|
|
892
912
|
const result = yield* Effect.promise(
|
|
893
|
-
() => Promise.resolve(
|
|
913
|
+
() => Promise.resolve(
|
|
914
|
+
evaluateFn({
|
|
915
|
+
input: testCaseItem.testCase.getInput(),
|
|
916
|
+
ctx,
|
|
917
|
+
output: outputDefinition
|
|
918
|
+
})
|
|
919
|
+
)
|
|
894
920
|
);
|
|
895
921
|
const { scores, metrics } = normalizeResult(result);
|
|
896
922
|
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
@@ -921,6 +947,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
921
947
|
passed: testCasePassed,
|
|
922
948
|
durationMs: Date.now() - started,
|
|
923
949
|
evaluatorScores,
|
|
950
|
+
outputDefinition,
|
|
924
951
|
errorMessage: testCaseError
|
|
925
952
|
};
|
|
926
953
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -1267,6 +1294,6 @@ var EffectRunner = class {
|
|
|
1267
1294
|
}
|
|
1268
1295
|
};
|
|
1269
1296
|
|
|
1270
|
-
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig,
|
|
1297
|
+
export { Dataset, Evaluator, Metric, Score, TestCase, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
|
|
1271
1298
|
//# sourceMappingURL=out.js.map
|
|
1272
1299
|
//# sourceMappingURL=index.js.map
|