@m4trix/evals 0.18.0 → 0.20.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -257,8 +257,15 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
257
257
  log: (message: unknown, options?: {
258
258
  label?: string;
259
259
  }) => void;
260
+ /**
261
+ * Creates an Error from string/object payloads for `return createError(...)` (or `throw createError(...)`).
262
+ * The payload is also logged and shown by the CLI when the evaluator fails.
263
+ */
264
+ createError: (message: unknown, options?: {
265
+ label?: string;
266
+ }) => Error;
260
267
  }
261
- type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Promise<TScore>;
268
+ type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
262
269
  interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
263
270
  name: string;
264
271
  inputSchema: TI;
@@ -289,6 +296,8 @@ declare class Evaluator<TInput = unknown, TOutput = unknown, TScore = unknown, T
289
296
  interface MetricItem<TData = unknown> {
290
297
  readonly id: string;
291
298
  readonly data: TData;
299
+ /** Per-item display name override (wins over def.name in rendering) */
300
+ readonly name?: string;
292
301
  }
293
302
  interface FormatMetricOptions {
294
303
  isAggregated?: boolean;
@@ -298,7 +307,9 @@ interface MetricDef<TData = unknown> {
298
307
  readonly name?: string;
299
308
  readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
300
309
  format(data: TData, options?: FormatMetricOptions): string;
301
- make(data: TData): MetricItem<TData>;
310
+ make(data: TData, options?: {
311
+ name?: string;
312
+ }): MetricItem<TData>;
302
313
  }
303
314
  declare const Metric: {
304
315
  of<TData>(config: {
@@ -315,6 +326,8 @@ interface ScoreItem<TData = unknown> {
315
326
  readonly id: string;
316
327
  readonly data: TData;
317
328
  readonly passed?: boolean;
329
+ /** Per-item display name override (wins over def.name in rendering) */
330
+ readonly name?: string;
318
331
  /** Attached def for formatting/aggregation without registry lookup (avoids n/a across module boundaries) */
319
332
  readonly def?: ScoreDef<TData>;
320
333
  }
@@ -330,6 +343,7 @@ interface ScoreDef<TData = unknown> {
330
343
  readonly aggregateValues: (values: ReadonlyArray<TData>) => TData;
331
344
  make(data: TData, options?: {
332
345
  definePassed?: (data: TData) => boolean;
346
+ name?: string;
333
347
  }): ScoreItem<TData>;
334
348
  }
335
349
  /** Helper to format using the right method based on isAggregated (for consumers that need a single entry point) */
@@ -338,22 +352,20 @@ declare const Score: {
338
352
  aggregate: {
339
353
  /** Average numeric fields. Use for scores like { value, delta }. */
340
354
  averageFields<K extends string>(fields: readonly K[]): (values: readonly Record<K, number>[]) => Record<K, number>;
341
- /** Average `value` with sample std dev. Use for percent-style scores. */
342
- averageWithVariance<T extends {
343
- value: number;
344
- }>(values: readonly T[]): T & {
355
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
356
+ averageWithVariance<K_1 extends string>(fields: readonly K_1[]): (values: readonly Record<K_1, number>[]) => Record<K_1, number> & {
345
357
  stdDev?: number | undefined;
346
358
  count: number;
347
359
  };
348
360
  /** All runs must pass. Use for binary scores. */
349
- all<T_1 extends {
361
+ all<T extends {
350
362
  passed: boolean;
351
- }>(values: readonly T_1[]): T_1 & {
363
+ }>(values: readonly T[]): T & {
352
364
  passedCount?: number | undefined;
353
365
  totalCount?: number | undefined;
354
366
  };
355
367
  /** Take last value (no aggregation). Use when aggregation is not meaningful. */
356
- last<T_2>(values: readonly T_2[]): T_2;
368
+ last<T_1>(values: readonly T_1[]): T_1;
357
369
  };
358
370
  of<TData>(config: {
359
371
  id: string;
package/dist/index.js CHANGED
@@ -501,7 +501,11 @@ var Metric = {
501
501
  name: config.name,
502
502
  aggregate: config.aggregate,
503
503
  format: config.format,
504
- make: (data) => ({ id: config.id, data })
504
+ make: (data, options) => ({
505
+ id: config.id,
506
+ data,
507
+ ...options?.name !== void 0 && { name: options.name }
508
+ })
505
509
  };
506
510
  registry.set(config.id, def);
507
511
  return def;
@@ -523,25 +527,61 @@ var ScoreAggregate = {
523
527
  const count = values.length || 1;
524
528
  const result = {};
525
529
  for (const field of fields) {
526
- result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
530
+ result[field] = values.reduce(
531
+ (s, v) => s + (v[field] ?? 0),
532
+ 0
533
+ ) / count;
527
534
  }
528
535
  return result;
529
536
  };
530
537
  },
531
- /** Average `value` with sample std dev. Use for percent-style scores. */
532
- averageWithVariance(values) {
533
- if (values.length === 0) {
534
- return { value: 0, stdDev: void 0, count: 0 };
535
- }
536
- const sum = values.reduce((s, v) => s + v.value, 0);
537
- const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
538
- const mean = sum / values.length;
539
- let stdDev;
540
- if (values.length >= 2) {
541
- const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
542
- stdDev = variance > 0 ? Math.sqrt(variance) : 0;
543
- }
544
- return { ...values[0], value: mean, stdDev, count: values.length };
538
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
539
+ averageWithVariance(fields) {
540
+ return (values) => {
541
+ const count = values.length;
542
+ const result = {};
543
+ for (const field of fields) {
544
+ result[field] = count === 0 ? 0 : values.reduce(
545
+ (sum, item) => sum + (item[field] ?? 0),
546
+ 0
547
+ ) / count;
548
+ }
549
+ const valueField = "value";
550
+ const hasValueField = fields.includes(valueField);
551
+ if (count === 0) {
552
+ if (hasValueField) {
553
+ result[valueField] = 0;
554
+ }
555
+ return {
556
+ ...result,
557
+ stdDev: void 0,
558
+ count: 0
559
+ };
560
+ }
561
+ let stdDev;
562
+ if (hasValueField && count >= 2) {
563
+ const sum = values.reduce(
564
+ (s, v) => s + (v[valueField] ?? 0),
565
+ 0
566
+ );
567
+ const sumSq = values.reduce(
568
+ (s, v) => {
569
+ const value = v[valueField] ?? 0;
570
+ return s + value * value;
571
+ },
572
+ 0
573
+ );
574
+ const mean = sum / count;
575
+ const variance = (sumSq - count * mean * mean) / (count - 1);
576
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
577
+ }
578
+ return {
579
+ ...values[0],
580
+ ...result,
581
+ stdDev,
582
+ count
583
+ };
584
+ };
545
585
  },
546
586
  /** All runs must pass. Use for binary scores. */
547
587
  all(values) {
@@ -575,6 +615,7 @@ var Score = {
575
615
  id: config.id,
576
616
  data,
577
617
  ...passed !== void 0 && { passed },
618
+ ...options?.name !== void 0 && { name: options.name },
578
619
  def
579
620
  // Attach def so rendering/aggregation works without registry lookup
580
621
  };
@@ -643,7 +684,7 @@ var percentScore = Score.of({
643
684
  displayStrategy: "bar",
644
685
  formatValue: (data) => data.value.toFixed(2),
645
686
  formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
646
- aggregateValues: Score.aggregate.averageWithVariance
687
+ aggregateValues: Score.aggregate.averageWithVariance(["value"])
647
688
  });
648
689
  var deltaScore = Score.of({
649
690
  id: "delta",
@@ -675,6 +716,8 @@ function createDiffString(expected, actual, diffOptions) {
675
716
  function formatLogMessage(msg) {
676
717
  if (typeof msg === "string")
677
718
  return msg;
719
+ if (msg instanceof Error)
720
+ return msg.stack ?? msg.message;
678
721
  try {
679
722
  if (msg !== null && typeof msg === "object") {
680
723
  return JSON.stringify(msg, null, 2);
@@ -1021,6 +1064,7 @@ function toNumericScore(value) {
1021
1064
  }
1022
1065
 
1023
1066
  // src/runner/execution.ts
1067
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1024
1068
  function computeEvaluatorPassed(evaluator, result, scores) {
1025
1069
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1026
1070
  if (scoresWithPassed.length > 0) {
@@ -1077,20 +1121,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1077
1121
  if (!evaluateFn) {
1078
1122
  continue;
1079
1123
  }
1124
+ const logs = [];
1125
+ const logDiff = (expected, actual, options) => {
1126
+ logs.push(createDiffLogEntry(expected, actual, options));
1127
+ };
1128
+ const log = (message, options) => {
1129
+ logs.push(createLogEntry(message, options));
1130
+ };
1131
+ const createError = (message, options) => {
1132
+ const entry = createLogEntry(message, options);
1133
+ const error = message instanceof Error ? message : new Error(entry.message);
1134
+ error[evaluatorErrorLogEntryKey] = entry;
1135
+ return error;
1136
+ };
1080
1137
  try {
1081
- const logs = [];
1082
- const logDiff = (expected, actual, options) => {
1083
- logs.push(createDiffLogEntry(expected, actual, options));
1084
- };
1085
- const log = (message, options) => {
1086
- logs.push(createLogEntry(message, options));
1087
- };
1088
1138
  const ctx = yield* Effect.promise(
1089
1139
  () => Promise.resolve(evaluator.resolveContext())
1090
1140
  );
1091
1141
  const result = yield* Effect.promise(
1092
- () => Promise.resolve(
1093
- evaluateFn({
1142
+ () => Promise.resolve().then(
1143
+ () => evaluateFn({
1094
1144
  input: testCaseItem.testCase.getInput(),
1095
1145
  ctx,
1096
1146
  output,
@@ -1100,10 +1150,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1100
1150
  datasetId: task.datasetId
1101
1151
  },
1102
1152
  logDiff,
1103
- log
1153
+ log,
1154
+ createError
1104
1155
  })
1105
1156
  )
1106
1157
  );
1158
+ if (result instanceof Error) {
1159
+ const evaluatorError = result;
1160
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1161
+ logs.push(taggedEntry ?? createLogEntry(result));
1162
+ testCaseError = result.message;
1163
+ evaluatorScores.push({
1164
+ evaluatorId,
1165
+ scores: [],
1166
+ passed: false,
1167
+ logs: logs.length > 0 ? logs : void 0
1168
+ });
1169
+ continue;
1170
+ }
1107
1171
  const { scores, metrics } = normalizeResult(result);
1108
1172
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1109
1173
  evaluatorScores.push({
@@ -1114,11 +1178,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1114
1178
  logs: logs.length > 0 ? logs : void 0
1115
1179
  });
1116
1180
  } catch (error) {
1181
+ if (error instanceof Error) {
1182
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1183
+ logs.push(taggedEntry ?? createLogEntry(error));
1184
+ }
1117
1185
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1118
1186
  evaluatorScores.push({
1119
1187
  evaluatorId,
1120
1188
  scores: [],
1121
- passed: false
1189
+ passed: false,
1190
+ logs: logs.length > 0 ? logs : void 0
1122
1191
  });
1123
1192
  }
1124
1193
  }