@m4trix/evals 0.18.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +142 -42
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +142 -42
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -30
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -30
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +97 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +21 -9
- package/dist/index.js +97 -28
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -257,8 +257,15 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
|
|
|
257
257
|
log: (message: unknown, options?: {
|
|
258
258
|
label?: string;
|
|
259
259
|
}) => void;
|
|
260
|
+
/**
|
|
261
|
+
* Creates an Error from string/object payloads for `return createError(...)` (or `throw createError(...)`).
|
|
262
|
+
* The payload is also logged and shown by the CLI when the evaluator fails.
|
|
263
|
+
*/
|
|
264
|
+
createError: (message: unknown, options?: {
|
|
265
|
+
label?: string;
|
|
266
|
+
}) => Error;
|
|
260
267
|
}
|
|
261
|
-
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Promise<TScore>;
|
|
268
|
+
type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
|
|
262
269
|
interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
|
|
263
270
|
name: string;
|
|
264
271
|
inputSchema: TI;
|
|
@@ -289,6 +296,8 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
|
|
|
289
296
|
interface MetricItem<TData = unknown> {
|
|
290
297
|
readonly id: string;
|
|
291
298
|
readonly data: TData;
|
|
299
|
+
/** Per-item display name override (wins over def.name in rendering) */
|
|
300
|
+
readonly name?: string;
|
|
292
301
|
}
|
|
293
302
|
interface FormatMetricOptions {
|
|
294
303
|
isAggregated?: boolean;
|
|
@@ -298,7 +307,9 @@ interface MetricDef<TData = unknown> {
|
|
|
298
307
|
readonly name?: string;
|
|
299
308
|
readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
|
|
300
309
|
format(data: TData, options?: FormatMetricOptions): string;
|
|
301
|
-
make(data: TData
|
|
310
|
+
make(data: TData, options?: {
|
|
311
|
+
name?: string;
|
|
312
|
+
}): MetricItem<TData>;
|
|
302
313
|
}
|
|
303
314
|
declare const Metric: {
|
|
304
315
|
of<TData>(config: {
|
|
@@ -315,6 +326,8 @@ interface ScoreItem<TData = unknown> {
|
|
|
315
326
|
readonly id: string;
|
|
316
327
|
readonly data: TData;
|
|
317
328
|
readonly passed?: boolean;
|
|
329
|
+
/** Per-item display name override (wins over def.name in rendering) */
|
|
330
|
+
readonly name?: string;
|
|
318
331
|
/** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
|
|
319
332
|
readonly def?: ScoreDef<TData>;
|
|
320
333
|
}
|
|
@@ -330,6 +343,7 @@ interface ScoreDef<TData = unknown> {
|
|
|
330
343
|
readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
|
|
331
344
|
make(data: TData, options?: {
|
|
332
345
|
definePassed?: (data: TData) => boolean;
|
|
346
|
+
name?: string;
|
|
333
347
|
}): ScoreItem<TData>;
|
|
334
348
|
}
|
|
335
349
|
/** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
|
|
@@ -338,22 +352,20 @@ declare const Score: {
|
|
|
338
352
|
aggregate: {
|
|
339
353
|
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
340
354
|
averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
|
|
341
|
-
/** Average
|
|
342
|
-
averageWithVariance<
|
|
343
|
-
value: number;
|
|
344
|
-
}>(values: readonly T[]): T & {
|
|
355
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
356
|
+
averageWithVariance<K_1 extends string>(fields: readonly K_1[]): (values: readonly Record<K_1, number>[]) => Record<K_1, number> & {
|
|
345
357
|
stdDev?: number | undefined;
|
|
346
358
|
count: number;
|
|
347
359
|
};
|
|
348
360
|
/** All runs must pass. Use for binary scores. */
|
|
349
|
-
all<
|
|
361
|
+
all<T extends {
|
|
350
362
|
passed: boolean;
|
|
351
|
-
}>(values: readonly
|
|
363
|
+
}>(values: readonly T[]): T & {
|
|
352
364
|
passedCount?: number | undefined;
|
|
353
365
|
totalCount?: number | undefined;
|
|
354
366
|
};
|
|
355
367
|
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
356
|
-
last<
|
|
368
|
+
last<T_1>(values: readonly T_1[]): T_1;
|
|
357
369
|
};
|
|
358
370
|
of<TData>(config: {
|
|
359
371
|
id: string;
|
package/dist/index.js
CHANGED
|
@@ -501,7 +501,11 @@ var Metric = {
|
|
|
501
501
|
name: config.name,
|
|
502
502
|
aggregate: config.aggregate,
|
|
503
503
|
format: config.format,
|
|
504
|
-
make: (data) => ({
|
|
504
|
+
make: (data, options) => ({
|
|
505
|
+
id: config.id,
|
|
506
|
+
data,
|
|
507
|
+
...options?.name !== void 0 && { name: options.name }
|
|
508
|
+
})
|
|
505
509
|
};
|
|
506
510
|
registry.set(config.id, def);
|
|
507
511
|
return def;
|
|
@@ -523,25 +527,61 @@ var ScoreAggregate = {
|
|
|
523
527
|
const count = values.length || 1;
|
|
524
528
|
const result = {};
|
|
525
529
|
for (const field of fields) {
|
|
526
|
-
result[field] = values.reduce(
|
|
530
|
+
result[field] = values.reduce(
|
|
531
|
+
(s, v) => s + (v[field] ?? 0),
|
|
532
|
+
0
|
|
533
|
+
) / count;
|
|
527
534
|
}
|
|
528
535
|
return result;
|
|
529
536
|
};
|
|
530
537
|
},
|
|
531
|
-
/** Average
|
|
532
|
-
averageWithVariance(
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
538
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
539
|
+
averageWithVariance(fields) {
|
|
540
|
+
return (values) => {
|
|
541
|
+
const count = values.length;
|
|
542
|
+
const result = {};
|
|
543
|
+
for (const field of fields) {
|
|
544
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
545
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
546
|
+
0
|
|
547
|
+
) / count;
|
|
548
|
+
}
|
|
549
|
+
const valueField = "value";
|
|
550
|
+
const hasValueField = fields.includes(valueField);
|
|
551
|
+
if (count === 0) {
|
|
552
|
+
if (hasValueField) {
|
|
553
|
+
result[valueField] = 0;
|
|
554
|
+
}
|
|
555
|
+
return {
|
|
556
|
+
...result,
|
|
557
|
+
stdDev: void 0,
|
|
558
|
+
count: 0
|
|
559
|
+
};
|
|
560
|
+
}
|
|
561
|
+
let stdDev;
|
|
562
|
+
if (hasValueField && count >= 2) {
|
|
563
|
+
const sum = values.reduce(
|
|
564
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
565
|
+
0
|
|
566
|
+
);
|
|
567
|
+
const sumSq = values.reduce(
|
|
568
|
+
(s, v) => {
|
|
569
|
+
const value = v[valueField] ?? 0;
|
|
570
|
+
return s + value * value;
|
|
571
|
+
},
|
|
572
|
+
0
|
|
573
|
+
);
|
|
574
|
+
const mean = sum / count;
|
|
575
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
576
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
577
|
+
}
|
|
578
|
+
return {
|
|
579
|
+
...values[0],
|
|
580
|
+
...result,
|
|
581
|
+
stdDev,
|
|
582
|
+
count
|
|
583
|
+
};
|
|
584
|
+
};
|
|
545
585
|
},
|
|
546
586
|
/** All runs must pass. Use for binary scores. */
|
|
547
587
|
all(values) {
|
|
@@ -575,6 +615,7 @@ var Score = {
|
|
|
575
615
|
id: config.id,
|
|
576
616
|
data,
|
|
577
617
|
...passed !== void 0 && { passed },
|
|
618
|
+
...options?.name !== void 0 && { name: options.name },
|
|
578
619
|
def
|
|
579
620
|
// Attach def so rendering/aggregation works without registry lookup
|
|
580
621
|
};
|
|
@@ -643,7 +684,7 @@ var percentScore = Score.of({
|
|
|
643
684
|
displayStrategy: "bar",
|
|
644
685
|
formatValue: (data) => data.value.toFixed(2),
|
|
645
686
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
646
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
687
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
647
688
|
});
|
|
648
689
|
var deltaScore = Score.of({
|
|
649
690
|
id: "delta",
|
|
@@ -675,6 +716,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
675
716
|
function formatLogMessage(msg) {
|
|
676
717
|
if (typeof msg === "string")
|
|
677
718
|
return msg;
|
|
719
|
+
if (msg instanceof Error)
|
|
720
|
+
return msg.stack ?? msg.message;
|
|
678
721
|
try {
|
|
679
722
|
if (msg !== null && typeof msg === "object") {
|
|
680
723
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1021,6 +1064,7 @@ function toNumericScore(value) {
|
|
|
1021
1064
|
}
|
|
1022
1065
|
|
|
1023
1066
|
// src/runner/execution.ts
|
|
1067
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1024
1068
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1025
1069
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1026
1070
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1077,20 +1121,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1077
1121
|
if (!evaluateFn) {
|
|
1078
1122
|
continue;
|
|
1079
1123
|
}
|
|
1124
|
+
const logs = [];
|
|
1125
|
+
const logDiff = (expected, actual, options) => {
|
|
1126
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1127
|
+
};
|
|
1128
|
+
const log = (message, options) => {
|
|
1129
|
+
logs.push(createLogEntry(message, options));
|
|
1130
|
+
};
|
|
1131
|
+
const createError = (message, options) => {
|
|
1132
|
+
const entry = createLogEntry(message, options);
|
|
1133
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1134
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1135
|
+
return error;
|
|
1136
|
+
};
|
|
1080
1137
|
try {
|
|
1081
|
-
const logs = [];
|
|
1082
|
-
const logDiff = (expected, actual, options) => {
|
|
1083
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1084
|
-
};
|
|
1085
|
-
const log = (message, options) => {
|
|
1086
|
-
logs.push(createLogEntry(message, options));
|
|
1087
|
-
};
|
|
1088
1138
|
const ctx = yield* Effect.promise(
|
|
1089
1139
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1090
1140
|
);
|
|
1091
1141
|
const result = yield* Effect.promise(
|
|
1092
|
-
() => Promise.resolve(
|
|
1093
|
-
evaluateFn({
|
|
1142
|
+
() => Promise.resolve().then(
|
|
1143
|
+
() => evaluateFn({
|
|
1094
1144
|
input: testCaseItem.testCase.getInput(),
|
|
1095
1145
|
ctx,
|
|
1096
1146
|
output,
|
|
@@ -1100,10 +1150,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1100
1150
|
datasetId: task.datasetId
|
|
1101
1151
|
},
|
|
1102
1152
|
logDiff,
|
|
1103
|
-
log
|
|
1153
|
+
log,
|
|
1154
|
+
createError
|
|
1104
1155
|
})
|
|
1105
1156
|
)
|
|
1106
1157
|
);
|
|
1158
|
+
if (result instanceof Error) {
|
|
1159
|
+
const evaluatorError = result;
|
|
1160
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1161
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1162
|
+
testCaseError = result.message;
|
|
1163
|
+
evaluatorScores.push({
|
|
1164
|
+
evaluatorId,
|
|
1165
|
+
scores: [],
|
|
1166
|
+
passed: false,
|
|
1167
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1168
|
+
});
|
|
1169
|
+
continue;
|
|
1170
|
+
}
|
|
1107
1171
|
const { scores, metrics } = normalizeResult(result);
|
|
1108
1172
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1109
1173
|
evaluatorScores.push({
|
|
@@ -1114,11 +1178,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1114
1178
|
logs: logs.length > 0 ? logs : void 0
|
|
1115
1179
|
});
|
|
1116
1180
|
} catch (error) {
|
|
1181
|
+
if (error instanceof Error) {
|
|
1182
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1183
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1184
|
+
}
|
|
1117
1185
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1118
1186
|
evaluatorScores.push({
|
|
1119
1187
|
evaluatorId,
|
|
1120
1188
|
scores: [],
|
|
1121
|
-
passed: false
|
|
1189
|
+
passed: false,
|
|
1190
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1122
1191
|
});
|
|
1123
1192
|
}
|
|
1124
1193
|
}
|