@m4trix/evals 0.4.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/cli-simple.cjs +20 -11
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.d.ts +0 -0
- package/dist/cli-simple.js +21 -12
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +16 -7
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.d.ts +0 -0
- package/dist/cli.js +16 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +15 -13
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +11 -10
- package/dist/index.js +14 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/cli-simple.d.cts +0 -1
- package/dist/cli.d.cts +0 -1
- package/dist/index.d.cts +0 -377
package/dist/index.d.ts
CHANGED
|
@@ -121,23 +121,24 @@ type TagMatcher = string | RegExp;
|
|
|
121
121
|
type PathMatcher = string | RegExp;
|
|
122
122
|
|
|
123
123
|
type InputOrBuilder<T> = T | (() => T);
|
|
124
|
-
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any,
|
|
124
|
+
interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
|
|
125
125
|
name: string;
|
|
126
126
|
tags: string[];
|
|
127
127
|
inputSchema: TI;
|
|
128
128
|
input: InputOrBuilder<Schema.Schema.Type<TI>>;
|
|
129
|
-
|
|
130
|
-
|
|
129
|
+
outputSchema?: TO;
|
|
130
|
+
output?: InputOrBuilder<Schema.Schema.Type<TO>>;
|
|
131
131
|
}
|
|
132
|
-
declare class TestCase<TInput = unknown,
|
|
132
|
+
declare class TestCase<TInput = unknown, TOutput = unknown> {
|
|
133
133
|
private readonly _config;
|
|
134
134
|
private constructor();
|
|
135
|
-
static describe<TI extends Schema.Schema.Any,
|
|
135
|
+
static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
|
|
136
136
|
getName(): string;
|
|
137
137
|
getTags(): string[];
|
|
138
138
|
getInputSchema(): Schema.Schema.Any;
|
|
139
139
|
getInput(): TInput;
|
|
140
|
-
|
|
140
|
+
getOutputSchema(): Schema.Schema.Any | undefined;
|
|
141
|
+
getOutput(): TOutput | undefined;
|
|
141
142
|
}
|
|
142
143
|
|
|
143
144
|
interface DatasetDefineConfig {
|
|
@@ -209,7 +210,7 @@ interface MetricDef<TData = unknown> {
|
|
|
209
210
|
declare const Metric: {
|
|
210
211
|
of<TData>(config: {
|
|
211
212
|
id: string;
|
|
212
|
-
name?: string;
|
|
213
|
+
name?: string | undefined;
|
|
213
214
|
format: (data: TData) => string;
|
|
214
215
|
}): MetricDef<TData>;
|
|
215
216
|
};
|
|
@@ -233,7 +234,7 @@ interface ScoreDef<TData = unknown> {
|
|
|
233
234
|
declare const Score: {
|
|
234
235
|
of<TData>(config: {
|
|
235
236
|
id: string;
|
|
236
|
-
name?: string;
|
|
237
|
+
name?: string | undefined;
|
|
237
238
|
displayStrategy: ScoreDisplayStrategy;
|
|
238
239
|
format: (data: TData) => string;
|
|
239
240
|
}): ScoreDef<TData>;
|
|
@@ -309,7 +310,7 @@ type RunnerEvent = {
|
|
|
309
310
|
passed: boolean;
|
|
310
311
|
metrics?: ReadonlyArray<MetricItem>;
|
|
311
312
|
}>;
|
|
312
|
-
|
|
313
|
+
output?: unknown;
|
|
313
314
|
errorMessage?: string;
|
|
314
315
|
} | {
|
|
315
316
|
type: 'RunCompleted';
|
|
@@ -374,4 +375,4 @@ interface BinaryScoreData {
|
|
|
374
375
|
}
|
|
375
376
|
declare const binaryScore: ScoreDef<BinaryScoreData>;
|
|
376
377
|
|
|
377
|
-
export {
|
|
378
|
+
export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, tokenCountMetric, withRunnerConfig };
|
package/dist/index.js
CHANGED
|
@@ -307,8 +307,8 @@ var TestCase = class _TestCase {
|
|
|
307
307
|
tags: config.tags,
|
|
308
308
|
inputSchema: config.inputSchema,
|
|
309
309
|
input: config.input,
|
|
310
|
-
|
|
311
|
-
|
|
310
|
+
outputSchema: config.outputSchema,
|
|
311
|
+
output: config.output
|
|
312
312
|
});
|
|
313
313
|
}
|
|
314
314
|
getName() {
|
|
@@ -323,12 +323,14 @@ var TestCase = class _TestCase {
|
|
|
323
323
|
getInput() {
|
|
324
324
|
return resolve(this._config.input);
|
|
325
325
|
}
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
326
|
+
getOutputSchema() {
|
|
327
|
+
return this._config.outputSchema;
|
|
328
|
+
}
|
|
329
|
+
getOutput() {
|
|
330
|
+
if (this._config.output === void 0) {
|
|
329
331
|
return void 0;
|
|
330
332
|
}
|
|
331
|
-
return resolve(
|
|
333
|
+
return resolve(this._config.output);
|
|
332
334
|
}
|
|
333
335
|
};
|
|
334
336
|
|
|
@@ -864,12 +866,12 @@ function normalizeResult(result) {
|
|
|
864
866
|
const metrics = Array.isArray(obj.metrics) ? obj.metrics : void 0;
|
|
865
867
|
return { scores, metrics };
|
|
866
868
|
}
|
|
867
|
-
function
|
|
869
|
+
function readOutput(testCase) {
|
|
868
870
|
const candidate = testCase;
|
|
869
|
-
if (typeof candidate.
|
|
871
|
+
if (typeof candidate.getOutput !== "function") {
|
|
870
872
|
return void 0;
|
|
871
873
|
}
|
|
872
|
-
return candidate.
|
|
874
|
+
return candidate.getOutput();
|
|
873
875
|
}
|
|
874
876
|
function nowIsoForFile() {
|
|
875
877
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
@@ -899,7 +901,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
899
901
|
const started = Date.now();
|
|
900
902
|
const evaluatorScores = [];
|
|
901
903
|
let testCaseError;
|
|
902
|
-
const
|
|
904
|
+
const output = readOutput(testCaseItem.testCase);
|
|
903
905
|
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
904
906
|
const evaluateFn = evaluator.getEvaluateFn();
|
|
905
907
|
if (!evaluateFn) {
|
|
@@ -914,7 +916,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
914
916
|
evaluateFn({
|
|
915
917
|
input: testCaseItem.testCase.getInput(),
|
|
916
918
|
ctx,
|
|
917
|
-
output
|
|
919
|
+
output
|
|
918
920
|
})
|
|
919
921
|
)
|
|
920
922
|
);
|
|
@@ -947,7 +949,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
947
949
|
passed: testCasePassed,
|
|
948
950
|
durationMs: Date.now() - started,
|
|
949
951
|
evaluatorScores,
|
|
950
|
-
|
|
952
|
+
output,
|
|
951
953
|
errorMessage: testCaseError
|
|
952
954
|
};
|
|
953
955
|
updateSnapshot(task.runId, (snapshot) => ({
|